reduceByKey with multiple values - pyspark

Each value in my RDD is a tuple:
temp = clustering.map(lambda x: (x[0][0], (1,1)))
temp.take(10)
[(0, (1, 1)),
(0, (1, 1)),
(6, (1, 1)),
(0, (1, 1)),
(0, (1, 1)),
(0, (1, 1)),
(0, (1, 1)),
(0, (1, 1)),
(7, (1, 1)),
(0, (1, 1))]
Then I tried to reduce it by key:
temp.reduceByKey(lambda a,b: (a[1]+b[1])).collect()
and got this error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-33-237802845981> in <module>
----> 1 temp.reduceByKey(lambda a,b: (a[1]+b[1])).collect()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py in collect(self)
814 """
815 with SCCallSiteSync(self.context) as css:
--> 816 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
817 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
818
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\java_gateway.py in __call__(self, *args)
1255 answer = self.gateway_client.send_command(command)
1256 return_value = get_return_value(
-> 1257 answer, self.gateway_client, self.target_id, self.name)
1258
1259 for temp_arg in temp_args:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
--> 328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 35.0 failed 1 times, most recent failure: Lost task 0.0 in stage 35.0 (TID 28, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 352, in func
return f(iterator)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\shuffle.py", line 240, in mergeValues
d[k] = comb(d[k], v) if k in d else creator(v)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-33-237802845981>", line 1, in <lambda>
TypeError: 'int' object is not subscriptable
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:238)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 2499, in pipeline_func
return func(split, prev_func(split, iterator))
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 352, in func
return f(iterator)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\rdd.py", line 1861, in combineLocally
merger.mergeValues(iterator)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\shuffle.py", line 240, in mergeValues
d[k] = comb(d[k], v) if k in d else creator(v)
File "C:\Users\helmis\AppData\Local\Continuum\anaconda3\lib\site-packages\pyspark\util.py", line 99, in wrapper
return f(*args, **kwargs)
File "<ipython-input-33-237802845981>", line 1, in <lambda>
TypeError: 'int' object is not subscriptable
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:452)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:588)
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:571)
at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:406)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:1124)
at scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:1130)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:409)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
at org.apache.spark.scheduler.Task.run(Task.scala:121)
at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Tried this:
temp.reduceByKey(lambda a,b: print(a)).collect()
and got:
[(0, None),
(6, None),
(7, None),
(8, None),
(5, None),
(3, None),
(9, None),
(1, (1, 1)),
(2, (1, 1)),
(4, None)]
So I don't know what's wrong; your help is appreciated.
What I'm ultimately trying to get is:
temp.reduceByKey(lambda a,b: (a[0] + b[0], a[1] + b[1])).collect()

Your values are two-element tuples, but you want to aggregate only the second element, so you wrote:
temp.reduceByKey(lambda a,b: (a[1]+b[1])).collect()
Reason for the error:
The function you pass to reduceByKey must return a value of the same type as the values it receives, because its result is fed back in as a in later merges. Your lambda returns a plain int (a[1]+b[1] is not a tuple), so on the next merge a is an int and a[1] raises TypeError: 'int' object is not subscriptable. Keep the first element in the returned tuple, i.e.:
temp.reduceByKey(lambda a, b: (a[0], a[1] + b[1])).take(20)
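For reference, a minimal self-contained sketch of the working reduction; the SparkContext setup and the sample pairs here are illustrative, not taken from the original session:
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
pairs = sc.parallelize([(0, (1, 1)), (0, (1, 1)), (6, (1, 1)), (7, (1, 1))])
# The lambda returns a tuple of the same shape as the values, so repeated merges keep working.
totals = pairs.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
print(totals.collect())  # e.g. [(0, (2, 2)), (6, (1, 1)), (7, (1, 1))] (order may vary)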

Related

AttributeError: 'HashingTF' object has no attribute '_java_obj'

When I use pyspark.ml.Pipeline to create a pipeline, the following problem occurs:
File "/opt/module/spark-2.4.3-bin-hadoop2.7/Pipeline.py", line 18, in
hashingTF = HashingTF(ipnutCol=tokenizer.getOutputCol(),outputCol="features")
File "/opt/module/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/init.py", line 110
, in wrapper
TypeError: init() got an unexpected keyword argument 'ipnutCol'
Exception ignored in:
Traceback (most recent call last):
File "/opt/module/spark-2.4.3-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/ml/wrapper.py", line 4
0, in del
AttributeError: 'HashingTF' object has no attribute '_java_obj'
I guess the API has changed, but I am not certain.
# Build a machine learning pipeline
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml import Pipeline
# Create a SparkSession object
spark = SparkSession.builder.master("local").appName("WorldCount").getOrCreate()
# 1. prepare training documents from a list of (id, text, label) tuples
training = spark.createDataFrame([
(0, 'a b c d e spark', 1.0),
(1, 'b d', 0.0),
(2, 'spark f g h', 1.0),
(3, 'hadoop mapreduce', 0.0)
],['id','text','label'])
# 2. Define the individual PipelineStages of the pipeline.
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(ipnutCol=tokenizer.getOutputCol(),outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)
# 3. Arrange the PipelineStages in processing order and create a Pipeline.
pipeline = Pipeline(stages=[tokenizer,hashingTF,lr])
# 4. Train the model
model = pipeline.fit(training)
# 5. Build the test data
test = spark.createDataFrame([
(4, 'spark i j k'),
(5, 'i m n'),
(6, 'spark hadoop spark'),
(7, 'apache hadoop')
],['id', 'text'])
# 6. Call transform() on the fitted PipelineModel so the test data
# flows through the fitted pipeline in order and predictions are generated
prediction = model.transform(test)
selected = prediction.select('id','text','probability','prediction')
for row in selected.collect():
    rid, text, prob, prediction = row
    print('({},{}) -> prob = {}, prediction={}'.format(rid, text, str(prob),prediction))
(4, spark i j k) --> prob=[0.155543713844,0.844456286156], prediction=1.000000
(5, l m n) --> prob=[0.830707735211,0.169292264789], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.0696218406195,0.93037815938], prediction=1.000000
(7, apache hadoop) --> prob=[0.981518350351,0.018481649649], prediction=0.000000
You have misspelled inputCol as ipnutCol; that is exactly what the error reports:
TypeError: __init__() got an unexpected keyword argument 'ipnutCol'
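For reference, a minimal sketch of the corrected stage definition, using the same column names as the snippet above:
from pyspark.ml.feature import HashingTF, Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
# The keyword is inputCol, not ipnutCol.
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")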

Nested Looping over dynamic list of list in scala [duplicate]

Given the following list:
val l = List(List(1,2,3), List(4,5))
I would like to generate all the possible combinations. Using yield, it can be done as follows:
scala> for (x <- l.head; y <- l.last) yield (x,y)
res17: List[(Int, Int)] = List((1,4), (1,5), (2,4), (2,5), (3,4), (3,5))
But the problem I have is that the List[List[Int]] is not fixed; it can grow and shrink in size, so I never know how many for loops I will need in advance. What I would like is to be able to pass that list into a function which will dynamically generate the combinations regardless of the number of lists I have, so:
def generator(x: List[List[Int]]): List[List[Int]]
Is there a built-in library function that can do this? If not, how would I go about doing it? Any pointers and hints would be great.
UPDATE:
The answer by @DNA blows the heap with the following (not-so-big) nested List structure:
List(
List(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300),
List(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300),
List(0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300),
List(0, 50, 100, 150, 200, 250, 300),
List(0, 100, 200, 300),
List(0, 200),
List(0)
)
Calling the generator2 function as follows:
generator2(
List(
List(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180, 185, 190, 195, 200, 205, 210, 215, 220, 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300),
List(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300),
List(0, 20, 40, 60, 80, 100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300),
List(0, 50, 100, 150, 200, 250, 300),
List(0, 100, 200, 300),
List(0, 200),
List(0)
)
)
Is there a way to generate the Cartesian product without blowing the heap?
Exception in thread "main" java.lang.OutOfMemoryError: Java heap space
at scala.LowPriorityImplicits.wrapRefArray(LowPriorityImplicits.scala:73)
at recfun.Main$.recfun$Main$$generator$1(Main.scala:82)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:105)
at recfun.Main$.recfun$Main$$generator$1(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:105)
at recfun.Main$.recfun$Main$$generator$1(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
at scala.collection.AbstractTraversable.flatMap(Traversable.scala:105)
at recfun.Main$.recfun$Main$$generator$1(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at recfun.Main$$anonfun$recfun$Main$$generator$1$1.apply(Main.scala:83)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:251)
at scala.collection.immutable.List.foreach(List.scala:318)
at scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:251)
Here's a recursive solution:
def generator(x: List[List[Int]]): List[List[Int]] = x match {
case Nil => List(Nil)
case h :: _ => h.flatMap(i => generator(x.tail).map(i :: _))
}
which produces:
val a = List(List(1, 2, 3), List(4, 5))
val b = List(List(1, 2, 3), List(4, 5), List(6, 7))
generator(a) //> List(List(1, 4), List(1, 5), List(2, 4),
//| List(2, 5), List(3, 4), List(3, 5))
generator(b) //> List(List(1, 4, 6), List(1, 4, 7), List(1, 5, 6),
//| List(1, 5, 7), List(2, 4, 6), List(2, 4, 7),
//| List(2, 5, 6), List(2, 5, 7), List(3, 4, 6),
//| List(3, 4, 7), List(3, 5, 6), List(3, 5, 7))
Update: the second case can also be written as a for comprehension, which may be a little clearer:
def generator2(x: List[List[Int]]): List[List[Int]] = x match {
case Nil => List(Nil)
case h :: t => for (j <- generator2(t); i <- h) yield i :: j
}
Update 2: for larger datasets, if you run out of memory, you can use Streams instead (if it makes sense to process the results incrementally). For example:
def generator(x: Stream[Stream[Int]]): Stream[Stream[Int]] =
if (x.isEmpty) Stream(Stream.empty)
else x.head.flatMap(i => generator(x.tail).map(i #:: _))
// NB pass in the data as Stream of Streams, not List of Lists
generator(input).take(3).foreach(x => println(x.toList))
>List(0, 0, 0, 0, 0, 0, 0)
>List(0, 0, 0, 0, 0, 200, 0)
>List(0, 0, 0, 0, 100, 0, 0)
It feels like your problem can be described in terms of recursion. If you have n lists of Int, say list1 of size m plus list2, ..., listn:
generate the combinations for list2 through listn (so n-1 lists);
for each of those combinations, generate m new ones, one for each value of list1.
The base case is a single list of Int: just split its elements into singleton lists.
So with List(List(1,2), List(3), List(4, 5)), the recursive call yields List(List(3,4), List(3,5)), and for each of these you add two combinations: List(1,3,4), List(2,3,4), List(1,3,5), List(2,3,5). A sketch of this recursion appears below.
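A minimal Scala sketch of that description, written to follow the steps literally (it is essentially the same recursion as the flatMap solution above; the name combine is just for illustration):
def combine(lists: List[List[Int]]): List[List[Int]] = lists match {
  // base case: a single list splits into singleton lists
  case single :: Nil => single.map(x => List(x))
  // recursive case: prepend each value of the head list to every combination of the tail
  case head :: tail => combine(tail).flatMap(c => head.map(_ :: c))
  case Nil => Nil
}
// combine(List(List(1, 2), List(3), List(4, 5)))
// => List(List(1, 3, 4), List(2, 3, 4), List(1, 3, 5), List(2, 3, 5))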
Ezekiel has exactly what I was looking for. This is just a minor tweak of it to make it generic.
def generateCombinations[T](x: List[List[T]]): List[List[T]] = {
x match {
case Nil => List(Nil)
case h :: _ => h.flatMap(i => generateCombinations(x.tail).map(i :: _))
}
}
Here is another solution based on Ezekiel's. It is more verbose, but it uses tail recursion (stack-safe).
import scala.annotation.tailrec
def generateCombinations[A](in: List[List[A]]): List[List[A]] =
generate(in, List.empty)
@tailrec
private def generate[A](in: List[List[A]], acc: List[List[A]]): List[List[A]] = in match {
case Nil => acc
case head :: tail => generate(tail, generateAcc(acc, head))
}
private def generateAcc[A](oldAcc: List[List[A]], as: List[A]): List[List[A]] = {
oldAcc match {
case Nil => as.map(List(_))
case nonEmptyAcc =>
for {
a <- as
xs <- nonEmptyAcc
} yield a :: xs
}
}
I realize this is old, but it seems like no other answer provided the non-recursive solution with fold.
def generator[A](xs: List[List[A]]): List[List[A]] = xs.foldRight(List(List.empty[A])) { (next, combinations) =>
for (a <- next; as <- combinations) yield a +: as
}
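A quick usage check of the fold version, with the same inputs used earlier in the thread:
generator(List(List(1, 2, 3), List(4, 5)))
// => List(List(1, 4), List(1, 5), List(2, 4), List(2, 5), List(3, 4), List(3, 5))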

Iterating through Stream

I'm very new to Scala, and I have to find the primes among the numbers in the range [i, j].
This is how I fill the stream:
val stream = (i to j).toStream
and then I call this function which supposed to find primes:
def findPrimes(s: Stream[Int]): Stream[Int] = {
println("HEAD: " + s.head)
return findPrimes(s.tail)
// s.head #:: findPrimes(s.tail.filter( _ % s.head != 0 ))
}
When I output the stream to the console, I am surprised to get this exception:
Exception in thread "main" java.util.NoSuchElementException: head of empty stream
at scala.collection.immutable.Stream$Empty$.head(Stream.scala:1104)
at scala.collection.immutable.Stream$Empty$.head(Stream.scala:1102)
at com.example.anna.app.HelloWorld$.findPrimes(HelloWorld.scala:43)
STREAM SIZE IS 100
at com.example.anna.app.HelloWorld$.prime(HelloWorld.scala:32)
HEAD: 1
at com.example.anna.app.HelloWorld$.delayedEndpoint$com$example$anna$app$HelloWorld$1(HelloWorld.scala:11)
HEAD: 2
HEAD: 3
at com.example.anna.app.HelloWorld$delayedInit$body.apply(HelloWorld.scala:3)
HEAD: 4
at scala.Function0.apply$mcV$sp(Function0.scala:34)
HEAD: 5
HEAD: 6
at scala.Function0.apply$mcV$sp$(Function0.scala:34)
HEAD: 7
at scala.runtime.AbstractFunction0.apply$mcV$sp(AbstractFunction0.scala:12)
HEAD: 8
HEAD: 9
at scala.App.$anonfun$main$1$adapted(App.scala:76)
HEAD: 10
HEAD: 11
at scala.collection.immutable.List.foreach(List.scala:389)
HEAD: 12
at scala.App.main(App.scala:76)
HEAD: 13
at scala.App.main$(App.scala:74)
HEAD: 14
HEAD: 15
at com.example.anna.app.HelloWorld$.main(HelloWorld.scala:3)
HEAD: 16
at com.example.anna.app.HelloWorld.main(HelloWorld.scala)
HEAD: 17
Starting from 17 up to the end, there is not a single exception line.
Also, the commented-out line doesn't work; it throws the same exception.
Any help & advice would be appreciated.
The Stream you pass to findPrimes is finite and gets exhausted as findPrimes keeps recursing on its tail.
Create an infinite stream using the Stream.from method.
Scala REPL
scala> :paste
// Entering paste mode (ctrl-D to finish)
def primeStream(s: Stream[Int]): Stream[Int] =
Stream.cons(s.head, primeStream(s.tail filter { _ % s.head != 0 }))
val primes = primeStream(Stream.from(2))
// Exiting paste mode, now interpreting.
primeStream: (s: Stream[Int])Stream[Int]
primes: Stream[Int] = Stream(2, ?)
scala> primes.take(100).toList
res0: List[Int] = List(2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541)
The problem with your code:
scala> val primes = primeStream((1 to 100).toStream)
primes: Stream[Int] = Stream(1, ?)
scala> primes.take(100).toList
java.util.NoSuchElementException: head of empty stream
at scala.collection.immutable.Stream$Empty$.head(Stream.scala:1104)
at scala.collection.immutable.Stream$Empty$.head(Stream.scala:1102)
at .primeStream(<console>:12)
at .$anonfun$primeStream$1(<console>:12)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1169)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1159)
at scala.collection.immutable.Stream.$anonfun$take$2(Stream.scala:789)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1169)
at scala.collection.immutable.Stream$Cons.tail(Stream.scala:1159)
at scala.collection.generic.Growable.loop$1(Growable.scala:54)
at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:58)
at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:50)
at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:186)
at scala.collection.mutable.ListBuffer.$plus$plus$eq(ListBuffer.scala:44)
at scala.collection.TraversableLike.to(TraversableLike.scala:590)
at scala.collection.TraversableLike.to$(TraversableLike.scala:587)
at scala.collection.AbstractTraversable.to(Traversable.scala:104)
at scala.collection.TraversableOnce.toList(TraversableOnce.scala:294)
at scala.collection.TraversableOnce.toList$(TraversableOnce.scala:294)
at scala.collection.AbstractTraversable.toList(Traversable.scala:104)
... 28 elided
Why does this throw an exception? Because the head method is called on an empty stream in the println statement.
So we can pattern-match the stream and return an empty stream when it is empty, as follows (there are better ways to traverse a stream, but I tried not to change your code very much):
def findPrimes(s: Stream[Int]): Stream[Int] = s match {
case Stream.Empty =>
println("END") //consider removing println statements
Stream.Empty
case h #:: tl =>
println(s"HEAD: $h")
h #:: findPrimes(tl)
}
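If the end goal is the primes in a bounded range [i, j], one possible sketch, assuming the pre-2.13 Stream used in the question and the sieve from the REPL answer above, is to sieve an infinite stream and then restrict it:
def primeStream(s: Stream[Int]): Stream[Int] =
  Stream.cons(s.head, primeStream(s.tail.filter(_ % s.head != 0)))

def primesInRange(i: Int, j: Int): List[Int] =
  // takeWhile stops the infinite sieve once primes exceed j; filter drops those below i
  primeStream(Stream.from(2)).takeWhile(_ <= j).filter(_ >= i).toList

// primesInRange(10, 30)  // List(11, 13, 17, 19, 23, 29)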

Comparison operator giving wrong answer in python

This is the input file:
12,221,311,4,5,6,71,22
Output
('L:', ['12', '221', '311', '4'])
('R:', ['5', '6', '71', '22'])
('L,R', '12', '5')
false
This condition is giving the wrong result: 12 < 5 is false, yet it still enters this branch.
Code:
def merge(Array,start,end,mid):
    A=Array
    p=start
    q=end
    r=mid
    size1=q-p+1
    size2=r-q
    print("size1:", size1)
    print("size2:", size2)
    L=[None]*size1
    R=[None]*size2
    #print("L[]", L)
    for i in range(size1):
        print("i:", i)
        L[i]=A[p+i]
    print("L:", L)
    for j in range(size2):
        R[j]=A[q+j+1]
    print("R:", R)
    #L[size1+1]=99999
    #R[size2+1]=99999
    i,j=0,0
    B=[None]*(r+1)
    print("L,R", L[0],R[0])
    # Getting an error here... the condition below is giving the wrong answer...
    if L[0]<R[0]:
        print("false")
    for k in range(p,r+1,1):
        print("i:",i)
        print("j:",j)
        if L[i]<=R[j]:
            B[k]=L[i]
            i=i+1
        else:
            B[k]=R[j]
            j=j+1
    print(B)

def merge_sort(Array,start,end):
    p=start
    r=end
    A=Array
    if p<r:
        print("P:",p)
        print("R:",r)
        q=(p+r)/2
        print("Q:",q)
        #merge_sort(A,p,q)
        #merge_sort(A,q+1,r)
        merge(A,p,q,r)

def main():
    p=0
    f = open('input.txt', 'r')
    A=[]
    #B=[]
    for ch in f:
        A=ch.split(',')
    f.close()
    print("Unsorted list")
    print(A)
    r=len(A)
    merge_sort(A,p,r-1)

if __name__=="__main__":
    main()
It's because you are comparing strings, not integers (a short illustration follows at the end of this answer).
Change the main function to this:
def main():
    p=0
    f = open('input.txt', 'r')
    A=[]
    #B=[]
    for ch in f:
        A=ch.split(',')
    f.close()
    A = [int(x) for x in A] #<--- Convert values to ints
    print("Unsorted list")
    print(A)
    r=len(A)
    merge_sort(A,p,r-1)
New output:
Unsorted list
[12, 221, 311, 4, 5, 6, 71, 22]
('P:', 0)
('R:', 7)
('Q:', 3)
('size1:', 4)
('size2:', 4)
('i:', 0)
('i:', 1)
('i:', 2)
('i:', 3)
('L:', [12, 221, 311, 4])
('R:', [5, 6, 71, 22])
('L,R', 12, 5)
('i:', 0)
('j:', 0)
('i:', 0)
('j:', 1)
('i:', 0)
('j:', 2)
('i:', 1)
('j:', 2)
('i:', 1)
('j:', 3)
('i:', 1)
('j:', 4)
(And then it fails)
Traceback (most recent call last):
File "test.py", line 68, in <module>
main()
File "test.py", line 65, in main
merge_sort(A,p,r-1)
File "test.py", line 51, in merge_sort
merge(A,p,q,r)
File "test.py", line 31, in merge
if L[i]<=R[j]:
IndexError: list index out of range
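For reference, a quick illustration of why the original comparison misbehaves: Python compares strings lexicographically, character by character, so '12' sorts before '5'.
print('12' < '5')        # True  -- '1' < '5', so the string '12' sorts first
print(int('12') < 5)     # False -- numeric comparison after converting to int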

Creating a Stream from High to Low Range

It's possible to create a Stream by specifying a low and high range:
scala> val x = Stream.range(1, 999)
x: scala.collection.immutable.Stream[Int] = Stream(1, ?)
But if I try to make a Stream in reverse order, I get an empty one.
scala> val y = Stream.range(999, 1)
y: scala.collection.immutable.Stream[Int] = Stream()
Also, if I try to simply reverse x, then the whole stream becomes evaluated.
scala> x.reverse
res16: scala.collection.immutable.Stream[Int] = Stream(998, 997, 996, 995, 994,
993, 992, 991, 990, 989, 988, 987, 986, 985, 984, 983, 982, 981, 980, ...
So how can I create a Stream over a particular range from high to low without first evaluating the whole thing and then calling toStream on it?
Stream.range has an overload that takes a step argument:
Stream.range(999, 0, -1)
This gives you a (lazy) stream starting at 999 and counting down to 1 (inclusive).
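A quick REPL-style check (the value name is illustrative) that the countdown stays lazy until elements are forced:
val y = Stream.range(999, 0, -1)   // Stream(999, ?) -- only the head is evaluated
println(y.take(5).toList)          // List(999, 998, 997, 996, 995)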