Round-tripping through Deflater in Scala fails - scala

I'm looking to roundtrip bytes through java's Deflater and running into issues. First the output, then the code. What am I doing wrong here, and how can I properly round trip through these streams?
Output:
scala> new String(decompress(compress("face".getBytes)))
(crazy output string of length 20)
Code:
def compress(bytes: Array[Byte]): Array[Byte] = {
val deflater = new java.util.zip.Deflater
val baos = new ByteArrayOutputStream
val dos = new DeflaterOutputStream(baos, deflater)
dos.write(bytes)
baos.close
dos.finish
dos.close
baos.toByteArray
}
def decompress(bytes: Array[Byte]): Array[Byte] = {
val deflater = new java.util.zip.Deflater
val baos = new ByteArrayOutputStream(512)
val bytesIn = new ByteArrayInputStream(bytes)
val in = new DeflaterInputStream(bytesIn, deflater)
var go = true
while (go) {
val b = in.read
if (b == -1)
go = false
else
baos.write(b)
}
baos.close
in.close
baos.toByteArray
}

You're (re-)Deflater-ing the result of the original deflation when you should be Inflater-ing it...

Related

Scala error when try to write Output File

I found this error when try to write a new output file from the code:
java.io.FileNotFoundException: s3a:/tfsdl-ghd-wb/raidnd/Incte_19&20.csv
val inputFile = "s3a:/tfsdl-ghd-wb/raidnd/rawdata.csv" // INPUT path
val outputFile = "s3a:/tfsdl-ghd-wb/raidnd/Incte_19&20.csv" // OUTPUT path
val dateFormat = new SimpleDateFormat("yyyyMMdd")
val fileSystem = getFileSystem(inputFile)
val inputData = readCSVFile(fileSystem, inputFile, skipHeader = true).toSeq
val writer = new PrintWriter(new File(outputFile))
writer.write("Sales,cust,Number,Date,Credit,SKU\n")
filtinp.foreach(x => {
val (com1, avg1) = com1Average(filtermp, x)
val (com2, avg2) = com2Average(filtermp, x)
writer.write(s"${x.Date},${x.cust},${x.Number},${x.Credit}\n")
})
writer.close()
def getFileSystem(path: String): FileSystem = {
val hconf = new Configuration() // initialize new hadoop configuration
new Path(path).getFileSystem(hconf) // get new filesystem to handle data

Scala cipher AES decryption does not work -- conditionally unable to decrypt file -- padding error

Scala 2.11.8, help with encryption... This follows principles from this stackoverflow site and gives an error (javax.crypto.BadPaddingException: Given final block not properly padded). I know why the error is caused, but, need help in handling it. Note: error occurs when the decrypt code is executed separately (i.e. on a different window, separate spark-shell instance). Rarely, the error occurs when both encrypt and decrypt are in the same instance... The salt and IvSpec are copied to the separate instance --as shown at the end (Note: I have verified that the bytes are the same in both instances)...
import java.io.{BufferedWriter, File, FileWriter, FileInputStream, FileOutputStream, BufferedInputStream, BufferedOutputStream, DataInputStream, DataOutputStream}
import org.apache.commons.io.FileUtils;
import javax.crypto.{Cipher, SecretKey, SecretKeyFactory, CipherInputStream, CipherOutputStream}
import javax.crypto.spec.{IvParameterSpec, SecretKeySpec, PBEKeySpec}
import java.security.SecureRandom
import scala.util.Random
import scala.math.pow
val password = "Let us test this"
val random = new SecureRandom();
val salt = Array.fill[Byte](16)(0)
random.nextBytes(salt)
val IvSpec1 = Array.fill[Byte](16)(0)
random.nextBytes(IvSpec1)
val IvSpec = new IvParameterSpec(IvSpec1)
def password_to_key(password: String, salt: Array[Byte]): SecretKeySpec = {
val spec = new PBEKeySpec(password.toCharArray(), salt, 65536, 256);
val f = SecretKeyFactory.getInstance("PBKDF2WithHmacSHA1");
val key = f.generateSecret(spec).getEncoded()
new SecretKeySpec(key, "AES")
}
val Key = password_to_key(password, salt)
val Algorithm = "AES/CBC/PKCS5Padding"
val cipher_encrypt = Cipher.getInstance(Algorithm)
cipher_encrypt.init(Cipher.ENCRYPT_MODE, Key, IvSpec)
val cipher_decrypt = Cipher.getInstance(Algorithm)
cipher_decrypt.init(Cipher.DECRYPT_MODE, Key, IvSpec)
def encrypt(file_in: String, file_out: String, cipher_encrypt:javax.crypto.Cipher ) {
val in = new BufferedInputStream(new FileInputStream(file_in))
val out = new BufferedOutputStream(new FileOutputStream(file_out))
val out_encrypted = new CipherOutputStream(out, cipher_encrypt)
val bufferSize = 1024 * pow(2,4).toInt
val bb = new Array[Byte](bufferSize)
var bb_read = in.read(bb, 0, bufferSize)
while (bb_read > 0) {
out_encrypted.write(bb, 0, bb_read)
bb_read = in.read(bb, 0, bufferSize)
}
in.close()
out_encrypted.close()
out.close()
}
def decrypt(file_in: String, file_out: String, cipher_decrypt:javax.crypto.Cipher ) {
val in = new BufferedInputStream(new FileInputStream(file_in))
val in_decrypted = new CipherInputStream(in, cipher_decrypt)
val out = new BufferedOutputStream(new FileOutputStream(file_out))
val bufferSize = 1024 * pow(2,4).toInt
val bb = new Array[Byte](bufferSize)
var bb_read = in_decrypted.read(bb, 0, bufferSize)
while (bb_read >0 ) {
out.write(bb, 0, bb_read)
bb_read = in_decrypted.read(bb, 0, bufferSize)
}
in_decrypted.close()
in.close()
out.close()
}
val file_in = "test.csv"
val file_encrypt = "test_encrypt.csv"
val file_decrypt = "test_decrypt.csv"
encrypt(file_in, file_encrypt, cipher_encrypt)
decrypt( file_encrypt, file_decrypt, cipher_decrypt)
// To write salt, IvSpec (to re-read it in a separate instance...)
val salt_loc = new File("salt.txt")
val IvSpec_loc = new File("IvSpec.txt")
val salt_w = new FileOutputStream(salt_loc)
salt_w.write(salt)
salt_w.close()
val IvSpec_w = new FileOutputStream(IvSpec_loc)
IvSpec_w.write(IvSpec1)
IvSpec_w.close()
//to re-read salt and IvSpec in a separate instance...
//Ignore that we do not need to re-read IvSpec
val salt_loc = new File("salt.txt")
val IvSpec_loc = new File("IvSpec.txt")
val salt_r = new FileInputStream(salt_loc)
val salt = Stream.continually(salt_r.read).takeWhile(-1 !=).map(_.toByte).toArray
val IvSpec_r = new FileInputStream(IvSpec_loc)
val IvSpec1 = Stream.continually(IvSpec_r.read).takeWhile(-1 !=).map(_.toByte).toArray
val IvSpec = new IvParameterSpec(IvSpec1)
When the decrypt code is executed in a separate java process, this definitely gives an error (error is related to padding). This works most times (95%+), when encrypt and decrypt are done in the same script (e.g. above will work most of the time - if executed in the same instance). If decrypt is done separately, by getting the salt and IvSpec in a separate window/process/thread/java instance, it fails.
error is java.io.IOException: javax.crypto.BadPaddingException: Given final block not properly padded
at javax.crypto.CipherInputStream.getMoreData(CipherInputStream.java:121)
at javax.crypto.CipherInputStream.read(CipherInputStream.java:239)
at javax.crypto.CipherInputStream.read(CipherInputStream.java:215)
at decrypt3(<console>:81)
... 60 elided
Caused by: javax.crypto.BadPaddingException: Given final block not properly padded
at com.sun.crypto.provider.CipherCore.doFinal(CipherCore.java:991)
at com.sun.crypto.provider.CipherCore.doFinal(CipherCore.java:847)
at com.sun.crypto.provider.AESCipher.engineDoFinal(AESCipher.java:446)
at javax.crypto.Cipher.doFinal(Cipher.java:2047)
at javax.crypto.CipherInputStream.getMoreData(CipherInputStream.java:118)
... 63 more

recursive value x$5 needs type

i am getting error at this line
val Array(outputDirectory, Utils.IntParam(numTweetsToCollect), Utils.IntParam(intervalSecs), Utils.IntParam(partitionsEachInterval)) =
Utils.parseCommandLineWithTwitterCredentials(args)
recursive value x$7 needs
type
recursive value x$1 needs
type
what does this error meaning, please guide me how to resolve this error.
object Collect {
private var numTweetsCollected = 0L
private var partNum = 0
private var gson = new Gson()
def main(args: Array[String]) {
// Process program arguments and set properties
if (args.length < 3) {
System.err.println("Usage: " + this.getClass.getSimpleName +
"<outputDirectory> <numTweetsToCollect> <intervalInSeconds> <partitionsEachInterval>")
System.exit(1)
}
val Array(outputDirectory, Utils.IntParam(numTweetsToCollect), Utils.IntParam(intervalSecs), Utils.IntParam(partitionsEachInterval)) =
Utils.parseCommandLineWithTwitterCredentials(args)
val outputDir = new File(outputDirectory.toString)
if (outputDir.exists()) {
System.err.println("ERROR - %s already exists: delete or specify another directory".format(
outputDirectory))
System.exit(1)
}
outputDir.mkdirs()
println("Initializing Streaming Spark Context...")
val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(intervalSecs))
val tweetStream = TwitterUtils.createStream(ssc, Utils.getAuth)
.map(gson.toJson(_))
tweetStream.foreachRDD((rdd, time) => {
val count = rdd.count()
if (count > 0) {
val outputRDD = rdd.repartition(partitionsEachInterval)
outputRDD.saveAsTextFile(outputDirectory + "/tweets_" + time.milliseconds.toString)
numTweetsCollected += count
if (numTweetsCollected > numTweetsToCollect) {
System.exit(0)
}
}
})
ssc.start()
ssc.awaitTermination()
}
}
Try removing the Utils.IntParam(.. from your pattern matched values. Extract the values, then parse them separately.

Serializing to disk and deserializing Scala objects using Pickling

Given a stream of homogeneous typed object, how would I go about serializing them to binary, writing them to disk, reading them from disk and then deserializing them using Scala Pickling?
For example:
object PicklingIteratorExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
case class Person(name: String, age: Int)
val personsIt = Iterator.from(0).take(10).map(i => Person(i.toString, i))
val pklsIt = personsIt.map(_.pickle)
??? // Write to disk
val readIt: Iterator[Person] = ??? // Read from disk and unpickle
}
I find a way to so for standard files:
object PickleIOExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
val tempPath = File.createTempFile("pickling", ".gz").getAbsolutePath
val outputStream = new FileOutputStream(tempPath)
val inputStream = new FileInputStream(tempPath)
val persons = for{
i <- 1 to 100
} yield Person(i.toString, i)
val output = new StreamOutput(outputStream)
persons.foreach(_.pickleTo(output))
outputStream.close()
val personsIt = new Iterator[Person]{
val streamPickle = BinaryPickle(inputStream)
override def hasNext: Boolean = inputStream.available > 0
override def next(): Person = streamPickle.unpickle[Person]
}
println(personsIt.mkString(", "))
inputStream.close()
}
But I am still unable to find a solution that will work with gzipped files. Since I do not know how to detect the EOF? The following throws an EOFexception since GZIPInputStream available method does not indicate the EOF:
object PickleIOExample extends App {
import scala.pickling.Defaults._
import scala.pickling.binary._
import scala.pickling.static._
val tempPath = File.createTempFile("pickling", ".gz").getAbsolutePath
val outputStream = new GZIPOutputStream(new FileOutputStream(tempPath))
val inputStream = new GZIPInputStream(new FileInputStream(tempPath))
val persons = for{
i <- 1 to 100
} yield Person(i.toString, i)
val output = new StreamOutput(outputStream)
persons.foreach(_.pickleTo(output))
outputStream.close()
val personsIt = new Iterator[Person]{
val streamPickle = BinaryPickle(inputStream)
override def hasNext: Boolean = inputStream.available > 0
override def next(): Person = streamPickle.unpickle[Person]
}
println(personsIt.mkString(", "))
inputStream.close()
}

Reading lines and raw bytes from the same source in scala

I need to write code that does the following:
Connect to a tcp socket
Read a line ending in "\r\n" that contains a number N
Read N bytes
Use those N bytes
I am currently using the following code:
val socket = new Socket(InetAddress.getByName(host), port)
val in = socket.getInputStream;
val out = new PrintStream(socket.getOutputStream)
val reader = new DataInputStream(in)
val baos = new ByteArrayOutputStream
val buffer = new Array[Byte](1024)
out.print(cmd + "\r\n")
out.flush
val firstLine = reader.readLine.split("\\s")
if(firstLine(0) == "OK") {
def read(written: Int, max: Int, baos: ByteArrayOutputStream): Array[Byte] = {
if(written >= max) baos.toByteArray
else {
val count = reader.read(buffer, 0, buffer.length)
baos.write(buffer, 0, count)
read(written + count, max, baos)
}
}
read(0, firstLine(1).toInt, baos)
} else {
// RAISE something
}
baos.toByteArray()
The problem with this code is that the use of DataInputStream#readLine raises a deprecation warning, but I can't find a class that implements both read(...) and readLine(...). BufferedReader for example, implements read but it reads Chars and not Bytes. I could cast those chars to bytes but I don't think it's safe.
Any other ways to write something like this in scala?
Thank you
be aware that on the JVM a char has 2 bytes, so "\r\n" is 4 bytes. This is generally not true for Strings stored outside of the JVM.
I think the safest way would be to read your file in raw bytes until you reache your Binary representation of "\r\n", now you can create a Reader (makes bytes into JVM compatible chars) on the first bytes, where you can be shure that there is Text only, parse it, and contiue safely with the rest of the binary data.
You can achive the goal to use read(...) and readLine(...) in one class. The idea is use BufferedReader.read():Int. The BufferedReader class has buffered the content so you can read one byte a time without performance decrease.
The change can be: (without scala style optimization)
import java.io.BufferedInputStream
import java.io.BufferedReader
import java.io.ByteArrayOutputStream
import java.io.PrintStream
import java.net.InetAddress
import java.net.Socket
import java.io.InputStreamReader
object ReadLines extends App {
val host = "127.0.0.1"
val port = 9090
val socket = new Socket(InetAddress.getByName(host), port)
val in = socket.getInputStream;
val out = new PrintStream(socket.getOutputStream)
// val reader = new DataInputStream(in)
val bufIns = new BufferedInputStream(in)
val reader = new BufferedReader(new InputStreamReader(bufIns, "utf8"));
val baos = new ByteArrayOutputStream
val buffer = new Array[Byte](1024)
val cmd = "get:"
out.print(cmd + "\r\n")
out.flush
val firstLine = reader.readLine.split("\\s")
if (firstLine(0) == "OK") {
def read(written: Int, max: Int, baos: ByteArrayOutputStream): Array[Byte] = {
if (written >= max) {
println("get: " + new String(baos.toByteArray))
baos.toByteArray()
} else {
// val count = reader.read(buffer, 0, buffer.length)
var count = 0
var b = reader.read()
while(b != -1){
buffer(count) = b.toByte
count += 1
if (count < max){
b = reader.read()
}else{
b = -1
}
}
baos.write(buffer, 0, count)
read(written + count, max, baos)
}
}
read(0, firstLine(1).toInt, baos)
} else {
// RAISE something
}
baos.toByteArray()
}
for test, below is a server code:
object ReadLinesServer extends App {
val serverSocket = new ServerSocket(9090)
while(true){
println("accepted a connection.")
val socket = serverSocket.accept()
val ops = socket.getOutputStream()
val printStream = new PrintStream(ops, true, "utf8")
printStream.print("OK 2\r\n") // 1 byte for alpha-number char
printStream.print("ab")
}
}
Seems this is the best solution I can find:
val reader = new BufferedReader(new InputStreamReader(in))
val buffer = new Array[Char](1024)
out.print(cmd + "\r\n")
out.flush
val firstLine = reader.readLine.split("\\s")
if(firstLine(0) == "OK") {
def read(readCount: Int, acc: List[Byte]): Array[Byte] = {
if(readCount <= 0) acc.toArray
else {
val count = reader.read(buffer, 0, buffer.length)
val asBytes = buffer.slice(0, count).map(_.toByte)
read(readCount - count, acc ++ asBytes)
}
}
read(firstLine(1).toInt, List[Byte]())
} else {
// RAISE
}
That is, use buffer.map(_.toByte).toArray to transform a char Array into a Byte Array without caring about the encoding.