I have a CSV file that I need to parse and do some action on every record. How do I use Free Monads with it? Currently, I'm loading the entire file into memory and would like to know if there is any better solution. Below is my program:
for {
reader <- F.getReader("my_file.csv")
csvRecords <- C.readCSV(reader)
_ <- I.processCSV(csvRecords)
_ <- F.close(reader)
} yield()
This code works for smaller files, but if I have very large files (over 1 GB), this wouldn't work very well. I'm using Commons CSV for reading the CSVRecords.
Looking into the code at your gist I think that the line with the comment is exactly the line you don't want at all:
object CSVIOInterpreter extends (CSVIO ~> Future) {
import scala.collection.JavaConverters._
override def apply[A](fa: CSVIO[A]): Future[A] = fa match {
case ReadCSV(reader) => Future.fromTry(Try {
CSVFormat.RFC4180
.withFirstRecordAsHeader()
.parse(reader)
.getRecords // Loads the complete file
.iterator().asScala.toStream
})
}
}
Just remove the whole getRecords line. CSVFormat.parse returns an instance of CSVParser which already implements Iterable<CSVRecord>. And the getRecords call is the only thing that force it to read the whole file.
Actually you can see CSVParser.getRecords implementation and it is
public List<CSVRecord> getRecords() throws IOException {
CSVRecord rec;
final List<CSVRecord> records = new ArrayList<>();
while ((rec = this.nextRecord()) != null) {
records.add(rec);
}
return records;
}
So it just materializes the whole file using this.nextRecord call which is obviously a more "core" part of the API.
So when I do a simplified version of your code without the getRecords call:
import cats._
import cats.free.Free
import java.io._
import org.apache.commons.csv._
import scala.collection.JavaConverters._
trait Action[A] {
def run(): A
}
object F {
import Free.liftF
case class GetReader(fileName: String) extends Action[Reader] {
override def run(): Reader = new FileReader(fileName)
}
case class CloseReader(reader: Reader) extends Action[Unit] {
override def run(): Unit = reader.close()
}
def getReader(fileName: String): Free[Action, Reader] = liftF(GetReader(fileName))
def close(reader: Reader): Free[Action, Unit] = liftF(CloseReader(reader))
}
object C {
import Free.liftF
case class ReadCSV(reader: Reader) extends Action[CSVParser] {
override def run(): CSVParser = CSVFormat.DEFAULT.parse(reader)
}
def readCSV(reader: Reader): Free[Action, CSVParser] = liftF(ReadCSV(reader))
}
object I {
import Free.liftF
case class ProcessCSV(parser: CSVParser) extends Action[Unit] {
override def run(): Unit = {
for (r <- parser.asScala)
println(r)
}
}
def processCSV(parser: CSVParser): Free[Action, Unit] = liftF(ProcessCSV(parser))
}
object Runner {
import cats.arrow.FunctionK
import cats.{Id, ~>}
val runner = new (Action ~> Id) {
def apply[A](fa: Action[A]): Id[A] = fa.run()
}
def run[A](free: Free[Action, A]): A = {
free.foldMap(runner)
}
}
def test() = {
val free = for {
// reader <- F.getReader("my_file.csv")
reader <- F.getReader("AssetsImportCompleteSample.csv")
csvRecords <- C.readCSV(reader)
_ <- I.processCSV(csvRecords)
_ <- F.close(reader)
} yield ()
Runner.run(free)
}
it seems to work OK in line-by-line mode.
Here how I use the CSV file to read and do some operation on that -
I use scala.io.Source.fromFile()
I create one case class of the type of header of CSV file to make the data more accessible and operational.
PS: I don't have knowledge of monads, as well as I am in beginner in Scala. I posted this as it may be helpful.
case class AirportData(id:Int, ident:String, name:String, typeAirport:String, latitude_deg:Double,
longitude_deg:Double, elevation_ft:Double, continent:String, iso_country:String, iso_region:String,
municipality:String)
object AirportData extends App {
def toDoubleOrNeg(s: String): Double = {
try {
s.toDouble
} catch {
case _: NumberFormatException => -1
}
}
val source = scala.io.Source.fromFile("resources/airportData/airports.csv")
val lines = source.getLines().drop(1)
val data = lines.flatMap { line =>
val p = line.split(",")
Seq(AirportData(p(0).toInt, p(1).toString, p(2).toString, p(3).toString, toDoubleOrNeg(p(4)), toDoubleOrNeg(p(5)),
toDoubleOrNeg(p(6)), p(7).toString, p(8).toString, p(9).toString, p(10).toString))
}.toArray
source.close()
println(data.length)
data.take(10) foreach println
}
Related
I am trying to setup tests for a scala compiler-plugin I am developping.
According to this similar question, it is possible to invoke the plugin programmatically. However, when I tried to do this the program could not find my plugin.
Error:(34, 25) not found: type GetFileFromAnnotation
for (phase <- new GetFileFromAnnotation(this).components)
Both files are in the same package.
Here is the code which is trying to test the plugin (pulled from the question above), the commented out code is an alternative to the code above it and throws the same error:
package compilerPlugin
import scala.reflect.internal.util.BatchSourceFile
import scala.tools.nsc.io.VirtualDirectory
import scala.tools.nsc.reporters.ConsoleReporter
import scala.tools.nsc.{Global, Settings}
object AnnotationFinderTest extends App {
// prepare the code you want to compile
val code =
"""
|class Typestate(filename:String) extends scala.annotation.StaticAnnotation
|
|#Typestate(filename = "MyProtocol.txt")
|class Cat{
| def comeAlive(): Unit = println("The cat is alive")
|}
|
|object Main extends App {
| val cat = new Cat()
| cat.comeAlive()
|}""".stripMargin
val sources = List(new BatchSourceFile("<test>", code))
println("sources "+sources)
val settings = new Settings
settings.usejavacp.value = true
settings.outputDirs.setSingleOutput(new VirtualDirectory("(memory)", None))
val compiler = new Global(settings, new ConsoleReporter(settings)) {
override protected def computeInternalPhases () {
super.computeInternalPhases
for (phase <- new GetFileFromAnnotation(this).components)
phasesSet += phase
}
}
new compiler.Run() compileSources (sources)
}
The code for the GetFileFromAnnotation plugin is:
package compilerPlugin
import java.io.{FileNotFoundException, IOException}
import scala.tools.nsc
import nsc.Global
import nsc.Phase
import nsc.plugins.Plugin
import nsc.plugins.PluginComponent
import scala.io.BufferedSource
import scala.io.Source._
class GetFileFromAnnotation(val global: Global) extends Plugin {
import global._
val name = "GetFileFromAnnotation"
val description = "gets file from typestate annotation"
val components: List[PluginComponent] = List[PluginComponent](Component)
private object Component extends PluginComponent {
val global: GetFileFromAnnotation.this.global.type = GetFileFromAnnotation.this.global
val runsAfter: List[String] = List[String]("parser")
val phaseName: String = GetFileFromAnnotation.this.name
def newPhase(_prev: Phase) = new GetFileFromAnnotationPhase(_prev)
class GetFileFromAnnotationPhase(prev: Phase) extends StdPhase(prev) {
override def name: String = GetFileFromAnnotation.this.name
def printFile(filename: String): Unit ={
val source = fromFile(filename)
try {
val it = source.getLines()
while (it.hasNext)
println(it.next())
}
catch{
case e: IOException => println(s"Had an IOException trying to use file $filename")
} finally {
source.close
}
}
def getFilenameFromAnnotation(annotation: Apply): Option[String] ={
annotation match{
case Apply(Select(New(Ident(TypeName("Typestate"))), con),List(NamedArg(Ident(TermName("filename")), Literal(Constant(filename))))) => Some(filename.toString)
case Apply(Select(New(Ident(TypeName("Typestate"))), con),List(Literal(Constant(filename)))) => Some(filename.toString)
case _ => None
}
}
def apply(unit: CompilationUnit): Unit = {
for (tree#q"$mods class $tpname[..$tparams] $ctorMods(...$paramss) extends { ..$earlydefns } with ..$parents { $self => ..$stats }" <- unit.body) {
val annotations = mods.annotations
for(annotation#Apply(arg1,arg2) <- annotations){
getFilenameFromAnnotation(annotation) match{
case Some(filename) => printFile(filename)
case None => println("Not a Typestate annotation")
}
}
}
}
}
}
}
Is it even feasible to use this method to test a plugin or is the only way to do it to use mocking? Is there another way of doing this entirely?
UPDATE:
Thanks for the comment and the github example.
It ended up working fine with the edited test code above and prints out the protocol file as expected. I am not sure why it wasn't working before.
I am trying to read a text file to compare 2 files. I have written the code to read the first file and I am expecting readFileStream function to give me Collections of String but I am getting only String.
Could you see where I have made wrong?
import java.io.{BufferedReader, FileInputStream, InputStreamReader}
import java.net.URI
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import scala.util.{Failure, Success, Try}
object TestCompareHDFSFiles {
def main(args: Array[String]): Unit = {
val hdfs = FileSystem.get(new Configuration())
val path1 = new Path(args(0))
val path2 = new Path(args(1))
readHDFSFile(hdfs, path1, path2)
}
// Accept a parameter which implements a close method
def using[A <: { def close(): Unit }, B](resource: A)(f: A => B): B =
try {
f(resource)
} finally {
resource.close()
}
def readHDFSFile(hdfs: FileSystem, path1: Path, path2: Path): Option[Stream[(String,String)]] = {
Try(using(new BufferedReader(new InputStreamReader(hdfs.open(path1))))(readFileStream))
} match {
case Success(result) => {
}
case Failure(ex) => {
println(s"Could not read file $path1, detail ${ex.getClass.getName}:${ex.getMessage}")
None
}
}
def readFileStream(br: BufferedReader)= {
for {
line <- Try(br.readLine())
if (line != null )
} yield line
}
}
I am struck here. Any help please.
Thanks,
I am writing a REST API using Scala and Akka Http.
For request handling, I implement Routes in the form of RequestContext => Future[RouteResult]
In each route, I have to pass the context of the user from function to function, which becomes a bit cumbersome. For example, in the code below, I always have to pass userContext into every function that interacts with the DB.
val route: Route = {
requestContext => {
val userContext = extractUser(requestContext)
val computeResult = compute(userContext)
requestContext.complete(computeResult)
}
}
However, if I set the context to a singleton, then it runs the risk of being overridden when the next calls comes in, since the API is multi-tenant.
Is there a better way to handle this?
Give an opportunity to Reader Monad. That will allow you a better abstraction over your UserContext.
This could be an aprox:
import scala.concurrent.Future
import scala.util.{Failure, Success}
import scalaz.{Reader, ReaderT}
trait UserContext {
val user: String
val passwd: String
}
trait YourDBFunctionsII[T] {
def compute(): Reader[UserContext, T]
def computeII(): Reader[UserContext, T]
}
object YourDBFunctionsII extends YourDBFunctionsII[String] {
override def compute(): Reader[UserContext, String] = Reader {
in: UserContext =>
???
}
override def computeII(): Reader[UserContext, String] = Reader {
in: UserContext =>
???
}
}
class YourRoutesII {
import YourDBFunctionsII._
val route: Route = { requestContext =>
{
val userContext: UserContext = ??? // Extract from RequestContext
val routines = for {
resul1 <- compute()
resul2 <- computeII()
} yield resul2
// Execute monad composition
val computeResult = routines.run(userContext)
requestContext.complete(computeResult)
}
}
}
If you need to deal with an asynchronous database driver you can use ScalaZ ReaderT type:
trait YourDBFunctionsIII[T] {
def compute(): ReaderT[Future, UserContext, T]
def computeII(): ReaderT[Future, UserContext, T]
}
// In case you want to deal with Futures
object YourDBFunctionsIII extends YourDBFunctionsIII[String] {
override def compute(): ReaderT[Future, UserContext, String] = ReaderT {
ctx =>
Future {
// Do something
ctx.passwd
}.recover {
case e: Throwable =>
"Error"
}
}
override def computeII(): ReaderT[Future, UserContext, String] = ReaderT {
ctx =>
Future {
// Do other thing
ctx.passwd
}
}
}
class YourRoutesIII {
import YourDBFunctionsIII._
val route: Route = { requestContext =>
{
val userContext: UserContext = ??? // Extract from RequestContext
val routines = for {
resul1 <- compute()
resul2 <- computeII()
} yield resul2
// Execute monad composition
val computeResult = routines.run(userContext)
requestContext.complete(computeResult)
}
}
}
In both cases you only need to run the monad composition with the UserContext instance.
When using Neo4j unmanaged extensions, one can stream results to the client while traversing the graph like this (in Scala):
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}
val stream: StreamingOutput = ???
Response.ok().entity(stream).`type`(MediaType.APPLICATION_JSON).build()
I can't find a similar possibility when using Neo4j 3 used-defined stored procedures. They return Java 8 Streams but I can't see how I could add elements to such streams while they already being consumed, in parallel.
Is it possible?
I have an example of that in one of the APOC procedures.
https://github.com/neo4j-contrib/neo4j-apoc-procedures/blob/master/src/main/java/apoc/cypher/Cypher.java#L77
I want to add more / a more general example of that in the future.
Here is what I came up with based on Michael Hunger code (in Scala).
QueueBasedSpliterator:
import java.util.Spliterator
import java.util.concurrent.{BlockingQueue, TimeUnit}
import java.util.function.Consumer
import org.neo4j.kernel.api.KernelTransaction
private class QueueBasedSpliterator[T](queue: BlockingQueue[T],
tombstone: T,
tx: KernelTransaction) extends Spliterator[T] {
override def tryAdvance(action: Consumer[_ >: T]): Boolean =
try {
if (tx.shouldBeTerminated()) false
else {
val entry = queue.poll(100, TimeUnit.MILLISECONDS)
if (entry == null || entry == tombstone) false
else {
action.accept(entry)
true
}
}
} catch {
case e: InterruptedException => false
}
override def trySplit(): Spliterator[T] = null
override def estimateSize(): Long = Long.MaxValue
override def characteristics(): Int = Spliterator.ORDERED | Spliterator.NONNULL
}
Notice the 100 ms timeout value. Might require tuning.
ResultsStream (wrapper around blocking queue):
import java.util.concurrent.BlockingQueue
class ResultsStream[T](tombstone: T, queue: BlockingQueue[T]) extends AutoCloseable {
def put(t: T): Unit = {
queue.put(t)
}
override def close(): Unit = {
put(tombstone)
}
}
CommonUtil helper methods:
import java.util.concurrent.ArrayBlockingQueue
import java.util.stream.{Stream, StreamSupport}
import org.neo4j.kernel.api.KernelTransaction
import org.neo4j.kernel.internal.GraphDatabaseAPI
import scala.concurrent.{ExecutionContext, Future}
object CommonUtil {
def inTx(db: GraphDatabaseAPI)(f: => Unit): Unit =
Managed(db.beginTx()) { tx => f; tx.success() }
def inTxFuture(db: GraphDatabaseAPI)(f: => Unit)(implicit ec: ExecutionContext): Future[Unit] =
Future(inTx(db)(f))
def streamResults[T](tombstone: T, tx: KernelTransaction)
(f: ResultsStream[T] => Any): Stream[T] = {
val queue = new ArrayBlockingQueue[T](100)
f(new ResultsStream(tombstone, queue))
StreamSupport.stream(new QueueBasedSpliterator[T](queue, tombstone, tx), false)
}
}
Some more helpers:
object Managed {
type AutoCloseableView[T] = T => AutoCloseable
def apply[T : AutoCloseableView, V](resource: T)(op: T => V): V =
try {
op(resource)
} finally {
resource.close()
}
}
Pool:
import java.util.concurrent.{ArrayBlockingQueue, ThreadPoolExecutor, TimeUnit}
import scala.concurrent.{ExecutionContext, ExecutionContextExecutor}
object Pool {
lazy val DefaultExecutionContent: ExecutionContextExecutor =
ExecutionContext.fromExecutor(createDefaultExecutor())
// values might be tuned in production
def createDefaultExecutor(corePoolSize: Int = Runtime.getRuntime.availableProcessors() * 2,
keepAliveSeconds: Int = 30) = {
val queueSize = corePoolSize * 25
new ThreadPoolExecutor(
corePoolSize / 2,
corePoolSize,
keepAliveSeconds.toLong,
TimeUnit.SECONDS,
new ArrayBlockingQueue[Runnable](queueSize),
new ThreadPoolExecutor.CallerRunsPolicy()
)
}
}
Usage in a procedure:
#Procedure("example.readStream")
def readStream(#Name("nodeId") nodeId: NodeId): Stream[StreamingItem] =
CommonUtil.streamResults(StreamingItem.Tombstone, kernelTx) { results =>
CommonUtil.inTxFuture(db) { // uses Pool.DefaultExecutionContent
Managed(results) { _ =>
graphUtil.findTreeNode(nodeId).foreach { node =>
// add elements to the stream here
results.put(???)
}
}
}
}
StreamingItem.Tombstone is just a static StreamingItem instance with special meaning to close the stream. db and kernelTx are just context variable set by Neo4j:
#Context
public GraphDatabaseAPI db;
#Context
public KernelTransaction kernelTx;
I'm creating an async library using Scala 2.10 futures. The constructor for the library takes a sequence of user-defined objects that implement a certain trait, and then a method on the library class sends some data one-by-one into the user-defined objects. I want the user to provide the ExecutionContext for the async operations when setting up the main instance, and then for that context to get passed into the user-defined objects as necessary. Simplified (pseudo?)code:
case class Response(thing: String)
class LibraryObject(stack: Seq[Processor])(implicit context: ExecutionContext) {
def entryPoint(data: String): Future[Response] = {
val response = Future(Response(""))
stack.foldLeft(response) { (resp, proc) => proc.process(data, resp) }
}
}
trait Processor {
def process(data: String, resp: Future[Response]): Future[Response]
}
It might be used something like this:
class ThingProcessor extends Processor {
override def process(data: String, response: Future[Response]) = {
response map { _.copy(thing = "THE THING") }
}
}
class PassThroughProcessor extends Processor {
override def process(request: Request, response: Future[Response]) = {
response
}
}
object TheApp extends App {
import ExecutionContext.Implicits.global
val stack = List(
new ThingProcessor,
new PassThroughProcessor
)
val libObj = new LibraryObject(stack)
val futureResponse = libObj.entryPoint("http://some/url")
// ...
}
I get a compile error for ThingProcessor:
Cannot find an implicit ExecutionContext, either require one yourself or import ExecutionContext.Implicits.global
My question is, how do I implicitly supply the ExecutionContext that LibraryObject has to the user-defined objects (ThingProcessor and PassThroughProcessor) or their methods without making the user (who will be writing the classes) worry about it--that is to say, I would prefer that the user did not have to type:
class MyFirstProcessor(implicit context: ExecutionContext)
or
override def process(...)(implicit context: ExecutionContext) = { ... }
The implicit scope includes companion objects and type parameters of base classes.
Or, library.submit(new library.Processor { def process() ... }).
This works, but wasn't my first thought, which was to be more clever:
import concurrent._
import concurrent.duration._
class Library(implicit xc: ExecutionContext = ExecutionContext.global) {
trait Processor {
implicit val myxc: ExecutionContext = xc
def process(i: Future[Int]): Future[Int]
}
def submit(p: Processor) = p process future(7)
}
object Test extends App {
val library = new Library
val p = new library.Processor {
def process(i: Future[Int]) = for (x <- i) yield 2 * x
}
val res = library submit p
val z = Await result (res, 10.seconds)
Console println z
}
Update:
import concurrent._
import concurrent.duration._
import java.util.concurrent.Executors
class Library()(implicit xc: ExecutionContext = ExecutionContext.global) {
trait Processor {
implicit val myxc: ExecutionContext = xc
def process(i: Future[Int]): Future[Int]
}
def submit(p: Processor) = p process future(7)
}
object ctx {
val xc = ExecutionContext fromExecutorService Executors.newSingleThreadExecutor
}
object library1 extends Library
object library2 extends Library()(ctx.xc)
object p extends library1.Processor {
def process(i: Future[Int]) = for (x <- i) yield 2 * x
}
object q extends library2.Processor {
def process(i: Future[Int]) = for (x <- i) yield 3 * x
}
object Test extends App {
val res = library1 submit p
//val oops = library2 submit p
//val oops = library1 submit q
val z = Await result (res, 10.seconds)
Console println z
Console println (Await result (library2 submit q, 10.seconds))
ctx.xc.shutdownNow()
}
It isn't much of a stretch to:
class Library(implicit xc: ExecutionContext = ExecutionContext.global) {
def submit(p: Processor): Future[Int] = p dueProcess future(7)
}
trait Processor {
implicit var myxc: ExecutionContext = _
def dueProcess(i: Future[Int])(implicit xc: ExecutionContext) = {
myxc = xc
process(i)
}
protected def process(i: Future[Int]): Future[Int]
}
object ctx {
val xc = ExecutionContext fromExecutorService Executors.newSingleThreadExecutor
}
object Test extends App {
def db() = Console println (new Throwable().getStackTrace mkString ("TRACE [\n ", "\n ", "\n]"))
val library = new Library()(ctx.xc)
val p = new Processor {
protected def process(i: Future[Int]) = for (x <- i) yield { db(); 2 * x }
}
val res = library submit p
val z = Await result (res, 10.seconds)
Console println z
ctx.xc.shutdownNow()
}