I am trying to list all objects in an AWS S3 bucket, given a bucket name and a filter prefix, using the following code.
import scala.collection.JavaConverters._
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model.ListObjectsV2Request
val bucket_name = "Mybucket"
val fiter_prefix = "Test/a/"
def list_objects(str: String): mutable.Buffer[String] = {
val request : ListObjectsV2Request = new ListObjectsV2Request().withBucketName(bucket_name).withPrefix(str)
var result: ListObjectsV2Result = new ListObjectsV2Result()
do {
result = s3_client.listObjectsV2(request)
val token = result.getNextContinuationToken
System.out.println("Next Continuation Token: " + token)
I have applied the continuation-token method, but I am only getting the last page of objects. For example, if the prefix has 2210 objects, I get back only 210 objects.
listObjectsV2 returns some or all (up to 1,000) of the objects in a bucket, as stated here. You need to use the continuation token to iterate over the rest of the objects in the bucket.
There is an example code here for java.
This is the code which worked for me.
import scala.collection.JavaConverters._
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model.ListObjectsV2Request
val bucket_name = "Mybucket"
val fiter_prefix = "Test/a/"
/**
 * Lists the keys of every object in `bucket_name` whose key starts with `str`.
 *
 * A single `listObjectsV2` call returns at most one page of results, so the
 * request is re-issued with the continuation token of the previous page until
 * the service reports that the listing is no longer truncated.
 *
 * @param str key prefix used to filter the listing
 * @return all matching object keys, in the order the pages were returned
 */
def list_objects(str: String): List[String] = {
  val s3_client = new AmazonS3Client
  val request = new ListObjectsV2Request().withBucketName(bucket_name).withPrefix(str)
  // A builder gives amortised O(1) appends; `final_list ::: list` re-copied the
  // accumulated keys on every page.
  val keys = List.newBuilder[String]
  var truncated = true
  while (truncated) {
    val result = s3_client.listObjectsV2(request)
    val token = result.getNextContinuationToken
    System.out.println("Next Continuation Token: " + token)
    keys ++= result.getObjectSummaries.asScala.map(_.getKey)
    // Feed the token back into the request; without this every iteration
    // fetches the first page again and a truncated listing never terminates.
    request.setContinuationToken(token)
    truncated = result.isTruncated
  }
  val final_list = keys.result()
  println(("size", final_list.size))
  final_list
}
A solution using vanilla Scala that avoids vars and uses tail recursion:
import software.amazon.awssdk.regions.Region
import software.amazon.awssdk.services.s3.S3Client
import software.amazon.awssdk.services.s3.model.{ListObjectsV2Request,
import scala.annotation.tailrec
import scala.collection.JavaConverters.asScalaBufferConverter
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
val sourceBucket = "yourbucket"
val sourceKey = "yourKey"
val subFolderPrefix = "yourprefix"
/**
 * Collects the keys of all objects matched by `initReq`, following
 * continuation tokens until the listing is exhausted.
 *
 * @param s3Client client used to issue the `listObjectsV2` calls
 * @param initReq  request describing the bucket/prefix to list; it is never
 *                 mutated, each follow-up page uses a copy carrying the token
 * @return every key, in the order the pages were returned
 */
def getAllPaths(s3Client: S3Client, initReq: ListObjectsV2Request): List[String] = {

  // Fetches the page described by `req`, appends its keys to `acc`, and recurses
  // while the response carries a continuation token.
  @tailrec
  def listAllObjectsV2(req: ListObjectsV2Request, acc: ListBuffer[String]): ListBuffer[String] = {
    val response = s3Client.listObjectsV2(req)
    // Append in place: prepending each page (`keys ++: initList`) put later
    // pages before earlier ones and copied the whole buffer every time.
    acc ++= response.contents().asScala.map(_.key())
    // nextContinuationToken() is null on the last page; Option(...) maps that to None.
    Option(response.nextContinuationToken()) match {
      case Some(token) =>
        listAllObjectsV2(initReq.toBuilder.continuationToken(token).build(), acc)
      case None =>
        acc
    }
  }

  listAllObjectsV2(initReq, ListBuffer.empty[String]).toList
}
val s3Client = S3Client.builder().region(Region.US_WEST_2).build()
val request: ListObjectsV2Request =
.prefix(sourceKey + "/" + subFolderPrefix)
val listofAllKeys: List[String] = getAllPaths(s3Client, request)
val patterns = ctx.getBroadcastState(patternStateDescriptor)
The imports I made
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.BroadcastStream
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
Here's the code
val env = StreamExecutionEnvironment.getExecutionEnvironment
val properties = new Properties()
val patternStream = new FlinkKafkaConsumer010("patterns", new SimpleStringSchema, properties)
val patterns = env.addSource(patternStream)
var patternData = patterns.map {
str =>
val splitted_str = str.split(",")
PatternStream(splitted_str(0).trim, splitted_str(1).trim, splitted_str(2).trim)
val logsStream = new FlinkKafkaConsumer010("logs", new SimpleStringSchema, properties)
// logsStream.setStartFromEarliest()
val logs = env.addSource(logsStream)
var data = logs.map {
str =>
val splitted_str = str.split(",")
LogsTest(splitted_str.head.trim, splitted_str(1).trim, splitted_str(2).trim)
val keyedData: KeyedStream[LogsTest, String] = data.keyBy(_.metric)
val bcStateDescriptor = new MapStateDescriptor[Unit, PatternStream]("patterns", Types.UNIT, Types.of[PatternStream]) // first type defined is for the key and second data type defined is for the value
val broadcastPatterns: BroadcastStream[PatternStream] = patternData.broadcast(bcStateDescriptor)
val alerts = keyedData
.process(new PatternEvaluator())
// println(alerts.getClass)
// val sinkProducer = new FlinkKafkaProducer010("output", new SimpleStringSchema(), properties)
env.execute("Flink Broadcast State Job")
class PatternEvaluator()
extends KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)] {
private lazy val patternStateDescriptor = new MapStateDescriptor("patterns", classOf[String], classOf[String])
private var lastMetricState: ValueState[String] = _
override def open(parameters: Configuration): Unit = {
val lastMetricDescriptor = new ValueStateDescriptor("last-metric", classOf[String])
lastMetricState = getRuntimeContext.getState(lastMetricDescriptor)
override def processElement(reading: LogsTest,
readOnlyCtx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#ReadOnlyContext,
out: Collector[(String, String, String)]): Unit = {
val metrics = readOnlyCtx.getBroadcastState(patternStateDescriptor)
if (metrics.contains(reading.metric)) {
val metricPattern: String = metrics.get(reading.metric)
val metricPatternValue: String = metrics.get(reading.value)
val lastMetric = lastMetricState.value()
val logsMetric = (reading.metric)
val logsValue = (reading.value)
if (logsMetric == metricPattern) {
if (metricPatternValue == logsValue) {
out.collect((reading.timestamp, reading.value, reading.metric))
override def processBroadcastElement(
update: PatternStream,
ctx: KeyedBroadcastProcessFunction[String, LogsTest, PatternStream, (String, String, String)]#Context,
out: Collector[(String, String, String)]
): Unit = {
val patterns = ctx.getBroadcastState(patternStateDescriptor)
if (update.metric == "IP") {
patterns.put(update.metric /*,update.operator*/ , update.value)
// else if (update.metric == "username"){
// patterns.put(update.metric, update.value)
// }
// else {
// println("No required data found")
// }
// }
Sample Data :- Logs Stream
"21/09/98","IP", ""
Pattern Stream
I'm unable to get the desired result from the analysis, i.e. 21/09/98,IP,
There's no error as of now, it's just not analysing the data
The code is reading streams (Checked)
One common source of trouble in cases like this is that the API offers no control over the order in which the patterns and the data are ingested. It could be that processElement is being called before processBroadcastElement.
I want to connect to my Amazon account in order to delete resources inside my s3 storage.
I have the access key and secret key, and this is how I started to build my connection to Amazon:
def connectToAmaozn(): Unit = {
val AWS_ACCESS_KEY=conf.getString("WebRecorder.PushSession.AccessKey")
val AWS_SECRET_KEY=conf.getString("WebRecorder.PushSession.SecretKey")
val AWSCredentials = new BasicAWSCredentials(AWS_ACCESS_KEY,AWS_SECRET_KEY)
Can you elaborate on how I may do this?
I used this solution to get bucket name and number of objects:
import scala.collection.JavaConversions._
import com.amazonaws.auth.{AWSStaticCredentialsProvider, BasicAWSCredentials}
import com.amazonaws.services.s3
import com.amazonaws.services.s3.model.{GetObjectTaggingRequest, ObjectListing, S3ObjectSummary}
import com.amazonaws.services.s3.{AmazonS3Client, AmazonS3ClientBuilder}
import com.clicktale.pipeline.framework.dal.ConfigParser.conf
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.auth.BasicAWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model._
import scala.language.postfixOps
/**
 * Thin wrapper around an S3 client bound to a single bucket.
 *
 * Credentials are read from the application configuration and the client is
 * created once, when the instance is constructed.
 */
class Amazon {
  val AWS_ACCESS_KEY = conf.getString("WebRecorder.PushSession.AccessKey")
  val AWS_SECRET_KEY = conf.getString("WebRecorder.PushSession.SecretKey")
  val bucketName = "nv-q-s3-assets-01"
  val provider = new AWSStaticCredentialsProvider(new BasicAWSCredentials(AWS_ACCESS_KEY, AWS_SECRET_KEY))
  val client = AmazonS3ClientBuilder.standard().withCredentials(provider).withRegion("us-east-1").build()

  /** Deletes every object currently listed in `bucketName`. */
  def removeObjectsFromBucket(): Unit = {
    println("Removing objects from bucket...")
    foreachObjectSummary(summary => client.deleteObject(bucketName, summary.getKey))
  }

  /** Counts the objects in `bucketName` and prints the total. */
  def countNumberOfObjectsInsideBucket(): Unit = {
    var count = 0
    foreachObjectSummary(_ => count += 1)
    println("Number of objects are: " + count)
  }

  /**
   * Applies `action` to each object summary in the bucket, one batch at a time.
   *
   * A listing is returned in batches; the loop asks for the next batch while the
   * current one is marked truncated. The previous version never advanced the
   * listing nor cleared its loop flag, so it could not terminate.
   */
  private def foreachObjectSummary(action: S3ObjectSummary => Unit): Unit = {
    var listing: ObjectListing = client.listObjects(bucketName)
    var morePages = true
    while (morePages) {
      val summaries = listing.getObjectSummaries.iterator()
      while (summaries.hasNext) {
        action(summaries.next())
      }
      if (listing.isTruncated) {
        listing = client.listNextBatchOfObjects(listing)
      } else {
        morePages = false
      }
    }
  }
}
You need an AWSCredentialsProvider:
val provider = new AWSStaticCredentialsProvider(
and then use it to create the client:
val client = AmazonS3ClientBuilder
.withRegion("us-west-1") // or whatever your region is
I am trying to implement topological sort using Spark's GraphX library.
This is the code I've written so far:
import java.util.ArrayList
import scala.collection.mutable.Queue
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.EdgeDirection
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
object MyObject {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Spark-App").setMaster("local[2]")
val sc = new SparkContext(conf)
val resources: RDD[Resource] = makeResources(sc)
val relations: RDD[Relation] = makeRelations(sc)
println("Building graph ...")
var graph = buildGraph(resources, relations, sc)
println("Graph built!!")
println("Testing topo sort ...")
val topoSortResult = topoSort(graph, sc);
println("topoSortResult = " + topoSortResult)
println("Testing topo sort done!")
def buildGraph(resources: RDD[Resource], relations: RDD[Relation], sc: SparkContext): Graph[Resource, Relation] =
val vertices: RDD[(Long, Resource)] = resources.map(resource => (resource.id, resource))
val edges: RDD[Edge[Relation]] = relations.map(relation => Edge(relation.srcId, relation.dstId, relation))
var graph = Graph[Resource, Relation](vertices, edges)
def makeResources(sc: SparkContext): RDD[Resource] =
var list: List[Resource] = List()
list = list :+ new Resource(1L)
list = list :+ new Resource(2L)
list = list :+ new Resource(3L)
list = list :+ new Resource(4L)
list = list :+ new Resource(5L)
def makeRelations(sc: SparkContext): RDD[Relation] =
var list: List[Relation] = List()
list = list :+ new Relation(1L, "depends_on", 2L)
list = list :+ new Relation(3L, "depends_on", 2L)
list = list :+ new Relation(4L, "depends_on", 2L)
list = list :+ new Relation(5L, "depends_on", 2L)
def topoSort(graph: Graph[Resource, Relation], sc: SparkContext): java.util.List[(VertexId, Resource)] =
// Will contain the result
val sortedResources: java.util.List[(VertexId, Resource)] = new ArrayList()
// Contains all the vertices
val vertices = graph.vertices
// Contains all the vertices whose in-degree > 0
val inDegrees = graph.inDegrees;
val inDegreesKeys_array = inDegrees.keys.collect();
// Contains all the vertices whose in-degree == 0
val inDegreeZeroList = vertices.filter(vertex => !inDegreesKeys_array.contains(vertex._1))
// A map of vertexID vs its in-degree
val inDegreeMapRDD = inDegreeZeroList.map(vertex => (vertex._1, 0)).union(inDegrees);
// Insert all the resources whose in-degree == 0 into a queue
val queue = new Queue[(VertexId, Resource)]
for (vertex <- inDegreeZeroList.toLocalIterator) { queue.enqueue(vertex) }
// Get an RDD containing the outgoing edges of every vertex
val neighbours = graph.collectNeighbors(EdgeDirection.Out)
// Initiate the algorithm
while (!queue.isEmpty) {
val vertex_top = queue.dequeue()
// Add the topmost element of the queue to the result
// Get the neigbours (from outgoing edges) of this vertex
// This will be an RDD containing just 1 element which will be an array of neighbour vertices
val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1))
// For each vertex, decrease its in-degree by 1
vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2
val newInDegree = oldInDegree - 1
// Reflect the new in-degree in the in-degree map RDD
inDegreeMapRDD.map(vertex_iter => {
if (vertex_iter._1 == vertex._1) {
(vertex._1, newInDegree)
// Add this vertex to the result if its in-degree has become zero
if (newInDegree == 0) {
return sortedResources
/**
 * Vertex payload of the dependency graph, identified by a numeric id.
 *
 * @param id identifier of the resource; also used as the graph vertex id
 */
class Resource(val id: Long) extends Serializable {
  /** Renders the resource as `id = <id>`. */
  override def toString(): String = s"id = $id"
}
/**
 * Edge payload of the dependency graph: a named, directed relation.
 *
 * @param srcId id of the source vertex
 * @param name  label of the relation (e.g. "depends_on")
 * @param dstId id of the destination vertex
 */
class Relation(val srcId: Long, val name: String, val dstId: Long) extends Serializable {
  /** Renders the relation as `<srcId> <name> <dstId>`. */
  override def toString(): String = s"$srcId $name $dstId"
}
I am getting the error :
org.apache.spark.SparkException: RDD transformations and actions can only be invoked by the driver, not inside of other transformations; for example, rdd1.map(x => rdd2.values.count() * x) is invalid because the values transformation and count action cannot be performed inside of the rdd1.map transformation. For more information, see SPARK-5063.
for the line val oldInDegree = inDegreeMapRDD.filter(vertex_iter => (vertex_iter._1 == vertex._1)).first()._2.
I guess this is because it is illegal to modify an RDD inside the for-each loop of some other RDD.
Also, I fear that queue.enqueue(vertex) will not work, since it is not possible to modify a local collection inside a for-each loop.
How do I correctly implement this topological sort algorithm ?
The full stack trace of the exception is uploaded here (Had to upload it externally to prevent exceeding the body size limit of StackOverflow).
vertex_neighbours.foreach(arr => {
val neighbour_array = arr._2
neighbour_array.foreach(vertex => {
. . .
The outer foreach could be replaced by a for loop.
val vertex_neighbours = neighbours.filter(vertex => vertex._1.equals(vertex_top._1)).collect()
You need to collect the RDD's contents to the driver before running a for loop over them.
I'm making my first steps in Scala and trying to implement an application which uses the Twitter streaming API. Below is my code (user tokens are hidden). From the main function, I call the getStreamData function, which calls makeAPIrequest.
package com.myname.myapp
import java.net.URL
import javax.net.ssl.HttpsURLConnection
import java.io.InputStream
import java.io.OutputStream;
import scala.io.Source
import java.net.URLEncoder
import java.util.Base64
import java.nio.charset.StandardCharsets
import scala.collection.immutable.HashMap
import java.util.Calendar
import java.io.Serializable
import scala.collection.immutable.TreeMap
import javax.crypto
import java.security.SecureRandom
import java.math.BigInteger
import scala.util.Random
object TwitterConnector {
private val AUTH_URL: String = "https://api.twitter.com/oauth2/token"
private val CONSUMER_KEY: String = "mykey"
private val CONSUMER_SECRET: String = "mysecret"
private val STREAM_URL: String = "https://stream.twitter.com/1.1/statuses/filter.json"
private var TOKEN: String = "mytoken"
private var TOKEN_SECRET: String = "mytokensecret"
def getStreamData {
val data = "track=" + "twitter"
makeAPIrequest(HTTPmethod("POST"), "https://stream.twitter.com/1.1/statuses/filter.json", None, Option(data))
private def makeAPIrequest(method: HTTPmethod, url:String, urlParams:Option[String], data:Option[String]){
//form oauth parameters
val oauth_nonce = Random.alphanumeric.take(32).mkString
val oauth_signature_method: String = "HMAC-SHA1"
val oauth_version: String = "1.0"
val oauth_timestamp = (Calendar.getInstance.getTimeInMillis/1000).toString()
var signatureData = scala.collection.mutable.Map(("oauth_consumer_key", CONSUMER_KEY), ("oauth_token", TOKEN), ("oauth_signature_method", oauth_signature_method), ("oauth_nonce", oauth_nonce), ("oauth_timestamp", oauth_timestamp), ("oauth_version", oauth_version))
//find keys for parameters
val getParams = (parameter: String) => {
val arr = parameter.split("=")
if(arr.length == 1) return
val key = arr(0).asInstanceOf[String]
val value = arr(1).asInstanceOf[String]
signatureData(key) = value
val params = urlParams match {
case Some(value) => {
val result = urlParams.get
result.split("&").foreach {getParams}
case None => ""
val postData = data match {
case Some(value) => {
val result = data.get
result.split("&").foreach {getParams}
case None => ""
//url-encode headers data
signatureData.foreach { elem => {
signatureData(urlEnc(elem._1)) = urlEnc(elem._2)
//sort headers data
val sortedSignatureData = TreeMap(signatureData.toSeq:_*)
println("Sorted: " + sortedSignatureData)
//form output string
var parameterString = ""
sortedSignatureData.foreach(elem => {
if(parameterString.length() > 0){
parameterString += "&"
parameterString += elem._1 + "=" + elem._2
val outputString = method.method.toUpperCase() + "&" + urlEnc(url) + "&" + urlEnc(parameterString)
val signingKey = urlEnc(CONSUMER_SECRET) + "&" + urlEnc(TOKEN_SECRET)
val SHA1 = "HmacSHA1";
val key = new crypto.spec.SecretKeySpec(bytes(signingKey), SHA1)
val oauth_signature = {
val mac = crypto.Mac.getInstance(SHA1)
new String(base64(mac.doFinal(bytes(outputString)).toString()))
println("Signature: " + oauth_signature)
val authHeader: String = "OAuth oauth_consumer_key=\"" + urlEnc(CONSUMER_KEY) + "\", oauth_nonce=\"" + urlEnc(oauth_nonce) + "\", oauth_signature=\"" + urlEnc(oauth_signature) + "\", oauth_signature_method=\"HMAC-SHA1\", oauth_timestamp=\"" + urlEnc(oauth_timestamp) + "\", oauth_token=\"" + urlEnc(TOKEN) + "\", oauth_version=\"1.0\""
var text = url
if(params.length > 0){
text += "?"
val apiURL: URL = new URL(text + params)
val apiConnection: HttpsURLConnection = apiURL.openConnection.asInstanceOf[HttpsURLConnection]
apiConnection.setRequestProperty("Authorization", authHeader)
apiConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
if(method.method == "POST" && postData.length() > 0){
println("POSTING ", postData)
val outStream: OutputStream = apiConnection.getOutputStream
val inStream: InputStream = apiConnection.getInputStream
val serverResponse = Source.fromInputStream(inStream).mkString
/** Returns the UTF-8 encoded bytes of `str`. */
private def bytes(str: String): Array[Byte] =
  str.getBytes(StandardCharsets.UTF_8)
/**
 * Percent-encodes `str` the way OAuth 1.0 signatures require.
 *
 * `URLEncoder` produces form encoding, which differs from that in three
 * places: a space becomes `+`, `*` is left untouched and `~` is escaped.
 * A literal `+` in the input is already emitted as `%2B`, so any `+` left in
 * the encoded text can only stand for a space. The previous
 * `.replace(" ", "%20")` never matched, because no raw space survives encoding.
 *
 * @param str text to encode
 * @return the encoded text with only unreserved characters left unescaped
 */
private def urlEnc(str: String): String =
  URLEncoder.encode(str, "UTF-8")
    .replace("+", "%20")
    .replace("*", "%2A")
    .replace("%7E", "~")
/** Base64-encodes the UTF-8 bytes of `str` using the basic (non-URL) alphabet. */
private def base64(str: String): String = {
  val utf8 = str.getBytes(StandardCharsets.UTF_8)
  Base64.getEncoder.encodeToString(utf8)
}
Twitter returns a 401 response code.
Obviously, I'm doing something wrong. Could you point me where my error is?
I recommend using a better library for making web requests, such as the WS library from the Play Framework. Right now, you're sort of writing Java in Scala. Here's a sample usage of the WS library:
val clientConfig = new DefaultWSClientConfig()
val secureDefaults: com.ning.http.client.AsyncHttpClientConfig = new NingAsyncHttpClientConfigBuilder(clientConfig).build()
val builder = new com.ning.http.client.AsyncHttpClientConfig.Builder(secureDefaults)
val secureDefaultsWithSpecificOptions: com.ning.http.client.AsyncHttpClientConfig = builder.build()
implicit val implicitClient = new play.api.libs.ws.ning.NingWSClient(secureDefaultsWithSpecificOptions)
val oauthCalc = OAuthCalculator(ConsumerKey(TwitterConfig.consumerKey, TwitterConfig.consumerSecret), RequestToken(TwitterConfig.accessKey, TwitterConfig.accessSecret))
def lookup(ids: List[String]): Future[List[Tweet]] =
.withQueryString("id" -> ids.mkString(","))
.map { r =>
You should be able to modify this example pretty easily to work with the streaming API.
Given a stream of homogeneously typed objects, how would I go about serializing them to binary, writing them to disk, reading them from disk and then deserializing them using Scala Pickling?
For example:
object PicklingIteratorExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
case class Person(name: String, age: Int)
val personsIt = Iterator.from(0).take(10).map(i => Person(i.toString, i))
val pklsIt = personsIt.map(_.pickle)
??? // Write to disk
val readIt: Iterator[Person] = ??? // Read from disk and unpickle
I found a way to do so for standard files:
object PickleIOExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
val tempPath = File.createTempFile("pickling", ".gz").getAbsolutePath
val outputStream = new FileOutputStream(tempPath)
val inputStream = new FileInputStream(tempPath)
val persons = for{
i <- 1 to 100
} yield Person(i.toString, i)
val output = new StreamOutput(outputStream)
val personsIt = new Iterator[Person]{
val streamPickle = BinaryPickle(inputStream)
override def hasNext: Boolean = inputStream.available > 0
override def next(): Person = streamPickle.unpickle[Person]
println(personsIt.mkString(", "))
But I am still unable to find a solution that works with gzipped files, since I do not know how to detect EOF. The following throws an EOFException, because GZIPInputStream's available method does not indicate EOF:
object PickleIOExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
val tempPath = File.createTempFile("pickling", ".gz").getAbsolutePath
val outputStream = new GZIPOutputStream(new FileOutputStream(tempPath))
val inputStream = new GZIPInputStream(new FileInputStream(tempPath))
val persons = for{
i <- 1 to 100
} yield Person(i.toString, i)
val output = new StreamOutput(outputStream)
val personsIt = new Iterator[Person]{
val streamPickle = BinaryPickle(inputStream)
override def hasNext: Boolean = inputStream.available > 0
override def next(): Person = streamPickle.unpickle[Person]
println(personsIt.mkString(", "))