sbcl encoding error only when executed from prompt? - unicode

I have a code that if executed from the slime prompt inside emacs run with no error. If I started sbcl from the prompt, I got the error:
* (ei:proc-file "BRAvESP000.log" "lixo")
debugger invoked on a SB-INT:STREAM-ENCODING-ERROR:
:UTF-8 stream encoding error on
#<SB-SYS:FD-STREAM for "file /Users/arademaker/work/IBM/scolapp/lixo"
{10049E8FF3}>:
the character with code 55357 cannot be encoded.
Type HELP for debugger help, or (SB-EXT:EXIT) to exit from SBCL.
restarts (invokable by number or by possibly-abbreviated name):
0: [OUTPUT-NOTHING ] Skip output of this character.
1: [OUTPUT-REPLACEMENT] Output replacement string.
2: [ABORT ] Exit debugger, returning to top level.
(SB-IMPL::STREAM-ENCODING-ERROR-AND-HANDLE #<SB-SYS:FD-STREAM for "file /Users/arademaker/work/IBM/scolapp/lixo" {10049E8FF3}> 55357)
0]
The mistery is that in both cases I am using the same sbcl 1.1.8 and the same machine, Mac OS 10.8.4. Any idea?
The code:
(defun proc-file (filein fileout &key (fn-convert #'identity))
(with-open-file (fout fileout
:direction :output
:if-exists :supersede
:external-format :utf8)
(with-open-file (fin filein :external-format :utf8)
(loop for line = (read-line fin nil)
while line
do
(handler-case
(let* ((line (ppcre:regex-replace "^.*{jsonTweet=" line "{\"jsonTweet\":"))
(data (gethash "jsonTweet" (yason:parse line))))
(yason:encode (funcall fn-convert (yason:parse data)) fout)
(format fout "~%"))
(end-of-file ()
(format *standard-output* "Error[~a]: ~a~%" filein line)))))))

This is almost certainly a bug in yason. JSON requires that if a non BMP character is escaped, it is done so through a surrogate pair. Here's a simple example with U+10000 (which is optionally escaped in json as "\ud800\udc00"; I use babel as babel's conversion is less strin):
(map 'list #'char-code (yason:parse "\"\\ud800\\udc00\""))
=> (55296 56320)
unicode code point 55296 (decimal) is the start for a surrogate pair, and should not appear except as a surrogate pair in UTF-16. Fortunately it can be easily worked around by using babel to encode the string to UTF-16 and back again:
(babel:octets-to-string (babel:string-to-octets (yason:parse "\"\\ud800\\udc00\"") :encoding :utf-16le) :encoding :utf-16le)
=> "𐀀"
You should be able to work around this by changing this line:
(yason:encode (funcall fn-convert (yason:parse data)) fout)
To use an intermediate string, which you convert to UTF-16 and back.
(write-sequence
(babel:octets-to-string
(babel:string-to-octets
(with-output-to-string (outs)
(yason:encode (funcall fn-convert (yason:parse data)) outs))
:encoding :utf-16le)
:encoding :utf-16le)
fout)
I submitted a patch that has been accepted to fix this in yason:
https://github.com/hanshuebner/yason/commit/4a9bdaae652b7ceea79984e0349a992a5458a0dc

Related

Binary pipes with SBCL

How do I set up binary pipes on the stdin and stdout of the subprocess using sb-ext:run-program? I'd like to have streams with element-type (unsigned-byte 8) to talk to the subprocess in Lisp.
run-program takes an :external-format argument but as far as I can tell it's only about text encodings, not binary. SBCL ships with a test program that does binary I/O but it defines a custom stream class using Gray streams, which seems advanced enough that there has to be an easier way.
Normally, sb-ext:run-program is responsible for creating intermediate streams when you pass the :stream option. The other answer shows that you can directly write bytes to it if you want to. But If you inspect how run-program is implemented, you can build the streams yourself by using the same functions called by run-program to generate an intermediate unix pipe, and read/write to it using binary streams.
(defpackage :so (:use :cl :alexandria))
(in-package :so)
Define an auxiliary function that closes a file descriptor while handling errors as warnings:
(defun unix-close/warn-on-error (file-descriptor)
(multiple-value-bind (status error) (sb-unix:unix-close file-descriptor)
(prog1 status
(unless (eql error 0)
(warn "Unix close error: ~S" error)))))
Then, a macro that temporarily create a unix pipe:
(defmacro with-unix-pipe ((read-fd write-fd) &body body)
(with-gensyms (first second)
`(multiple-value-bind (,first ,second) (sb-unix:unix-pipe)
(if ,first
(unwind-protect
(multiple-value-bind (,read-fd ,write-fd)
(values ,first ,second)
,#body)
(unix-close/warn-on-error ,first)
(unix-close/warn-on-error ,second))
(error "Unix pipe error: ~s" ,second)))))
However, run-program expects streams, not file descriptors. Here you have a macro that binds a variable to a stream tied to a file descriptor:
(defmacro with-fd-stream% ((var fd direction &rest fd-args) &body body)
(check-type direction (member :output :input))
(with-gensyms (in%)
`(let ((,in% (sb-sys:make-fd-stream ,fd ,direction t ,#fd-args)))
(unwind-protect (let ((,var ,in%))
(declare (dynamic-extent ,var))
,#body)
(close ,in%)))))
And the macro that does the same for a pair if in/out file descriptors:
(defmacro with-fd-streams (((in read-fd &rest read-args)
(out write-fd &rest write-args))
&body body)
`(with-fd-stream% (,in ,read-fd :input ,#read-args)
(with-fd-stream% (,out ,write-fd :output ,#write-args)
,#body)))
Finally, you can test your code with the following:
(let ((ub8 '(unsigned-byte 8)))
(with-unix-pipe (read write)
(with-fd-streams ((in read :element-type ub8)
(out write :element-type ub8))
(fresh-line)
(sb-ext:run-program "dd"
'("if=/dev/random" "count=1" "bs=64")
:search t
:output out
:error nil
:wait nil
:status-hook (lambda (p)
(unless (sb-ext:process-alive-p p)
(close out))))
(sb-ext:run-program "hd"
'()
:search t
:input in
:output *standard-output*
:wait t))))
The first test that you linked already seems to show that you can simply send bytes to the streams created with :input :stream and :output :stream.
I'd suggest using uiop:launch-program instead for portability:
(let ((pri (uiop:launch-program "cat" :input :stream :output :stream)))
(write-byte 43 (uiop:process-info-input pri))
(force-output (uiop:process-info-input pri))
(read-byte (uiop:process-info-output pri)))
=> 43

Simple vs complex user entry functions in Lisp

At this site: http://www.gigamonkeys.com/book/practical-a-simple-database.html there is user entry function listed as follows:
(defun prompt-read (prompt)
(format *query-io* "~%~a: " prompt)
(force-output *query-io*)
(read-line *query-io*))
Are there any major advantages of above function as compared to following simpler form:
(defun prompt-read2 (prompt)
(format t "~%~a: " prompt)
(setf answer (read-line)))
Is it recommended to always use force-output and *query-io* all the time?
Setting the answer to a global variable like that is bad. You should just return the answer and let the caller do what it wants with it. If you do use special (~global) variables, you should put asterisks around the name (*ANSWER* instead of ANSWER).
FORCE-OUTPUT is needed to ensure that the user actually sees the prompt before having to answer. If I run the second version using SBCL in a terminal, the program just freezes to wait for input without saying anything.
*QUERY-IO* should be used for querying things from the user, because some environment might want to handle that differently from other output. For example, someone might write a GUI wrapper for your program that turns the queries into graphical dialogs. Or maybe they want to run it as a part of a script, providing the input from a string.
(defun prompt-read (prompt)
(format *query-io* "~%~a: " prompt)
(force-output *query-io*)
(read-line *query-io*))
(defun hello ()
(format t "~&Hello ~a!~%" (prompt-read "What's your name")))
(defmacro with-input ((input) &body body)
`(let ((*query-io* (make-two-way-stream (make-string-input-stream ,input)
(make-string-output-stream))))
,#body))
(defun test ()
(with-input ("jkiiski")
(hello))
(with-input ("rnso")
(hello)))
(test)
; Hello jkiiski!
; Hello rnso!
Edit
A more complex example using SBCLs gray streams.
(defclass foo-stream (sb-gray:fundamental-character-input-stream)
((output-input-script :initarg :script :accessor foo-stream-script)
(output-stream :initarg :out :accessor foo-stream-out)
(current-input :initform nil :accessor foo-stream-current-input)))
(defmethod sb-gray:stream-read-char ((stream foo-stream))
(with-accessors ((input foo-stream-current-input)
(out foo-stream-out)
(script foo-stream-script)) stream
(when (or (null input)
(not (listen input)))
(let ((output (string-trim '(#\space #\newline)
(get-output-stream-string out))))
(setf input (make-string-input-stream
(format nil "~a~%"
(cdr (assoc output script :test #'string=)))))))
(read-char input)))
(defun prompt-read (prompt)
(format *query-io* "~%~a: " prompt)
(force-output *query-io*)
(read-line *query-io*))
(defun hello ()
(format t "~&Hello ~a!~%" (prompt-read "What's your name"))
(format t "~&I'm ~a too!" (prompt-read "How are you"))
(format t "~&~a~%" (if (string-equal (prompt-read
"Do you want to delete all your files")
"yes")
"Deleting all files... (not really)"
"Not deleting anything.")))
(defmacro with-input-script ((script) &body body)
(let ((out-sym (gensym "out")))
`(let* ((,out-sym (make-string-output-stream))
(*query-io* (make-two-way-stream
(make-instance 'foo-stream
:out ,out-sym
:script ,script)
,out-sym)))
,#body)))
(defun test ()
(with-input-script ('(("What's your name:" . "jkiiski")
("How are you:" . "great")
("Do you want to delete all your files:" . "No")))
(hello))
(with-input-script ('(("What's your name:" . "Foo Bar")
("How are you:" . "fine")
("Do you want to delete all your files:" . "Yes")))
(hello)))
(test)
; Hello jkiiski!
; I'm great too!
; Not deleting anything.
; Hello Foo Bar!
; I'm fine too!
; Deleting all files... (not really)
Yes you code is easy but the first is more clarifying what are you doing:
*query-io* is a global variable (which you can tell because of the * naming convention for global variables) that contains the input stream
connected to the terminal. The return value of prompt-read will be the
value of the last form, the call to READ-LINE, which returns the
string it read (without the trailing newline.)
This is what they said about *query-io*
And about the streams that you can put there works as follow:
most other I/O functions also accept T and NIL as stream designators
but with a different meaning: as a stream designator, T designates the
bidirectional stream *TERMINAL-IO*, while NIL designates
*STANDARD-OUTPUT* as an output stream and *STANDARD-INPUT* as an input stream
in this case it seems that this is only pointing to *standard-input* and not to the bidirectional stream t

symbolic expression stream I/O

In Common Lisp, how can one read & write symbolic expressions from/to streams? For example, I might want to write an anonymous function to file and then read and funcall it:
;;; sexp-io.lisp
;;; Try writing a sexp to file and reading it back in
(with-open-file (file "~/Documents/Lisp/Concurrency/sexp.lisp"
:direction :output :if-exists :supersede)
(print #'(lambda () (+ 1 1)) file))
(with-open-file (file "~/Documents/Lisp/Concurrency/sexp.lisp"
:direction :input)
(read file))
However, that code results in dubious output
#<Anonymous Function #x3020018F950F>
which does result in an error when I try reading it back in:
> Error: Reader error on #<BASIC-FILE-CHARACTER-INPUT-STREAM ("/Users/frank/Documents/Lisp/Concurrency/sexp.lisp"/7 UTF-8) #x3020018F559D>, near position 3, within "
> #<Anonymous ":
> "#<" encountered.
> While executing: CCL::SIGNAL-READER-ERROR, in process Listener(4).
You are doing TRT, except for #' which turns the list (lambda () (+ 1 1)) into a function object. Just replace the sharp-quote (which is read as function) with a simple quote (which is read as quote) and it should work.
Another change you might want to make is replacing print with write with argument :readably t:
(write my-object :stream out :readably t)
The benefit of :readably is that it fails if it cannot write in a way that will preserve print-read consistency.

Repeated calls to format ignore the ~t option

I have this:
(defun promptread (prompt)
(format *query-io* "~10t~a:" prompt)
(force-output *query-io*)
(read-line *query-io*))
(defun prompt-cd ()
(make-cd
(promptread "Artist")
(promptread "Album")
(promptread "Rating")
(promptread "Like [y/n]")))
It works, however the format ~10t only affects the first call to promptread inside make-cd; the others are left-aligned without this padding.
Why would this be?
REPL:
CL-USER> (addcds)
Artist:Dixie
Album:Funny
Rating:22
The first promptread is indented because of the format with ~10t but not the others, which use the same exact format call.
The problem is that after force-output and readline, the cursor is not known to formatto be at position 0. Thus absolute tabulation will fail. If you start the format string with ~&, you will see this as an additional newline will be outputted anyway.
To solve the problem use the # modifier to get relative tabulation:
(format *query-io* "~10#t~a:" prompt)

How to correct files with mixed encodings?

Given a corrupted file with mixed encoding (e.g. utf-8 and latin-1), how do I configure Emacs to "project" all its symbols to a single encoding (e.g. utf-8) when saving the file?
I did the following function to automatize some of the cleaning, but I would guess I could find somewhere the information to map the symbol "é" in one encoding to "é" in utf-8 somewhere in order to improve this function (or that somebody already wrote such a function).
(defun jyby/cleanToUTF ()
"Cleaning to UTF"
(interactive)
(progn
(save-excursion (replace-regexp "अ" ""))
(save-excursion (replace-regexp "आ" ""))
(save-excursion (replace-regexp "ॆ" ""))
)
)
(global-unset-key [f11])
(global-set-key [f11] 'jyby/cleanToUTF)
I have many files "corrupted" with mixed encoding (due to copy pasting from a browser with an ill font configuration), generating the error below. I sometime clean them by hand by searching and replacing for each problematic symbol by either "" or the appropriate character, or more quickly specifying "utf-8-unix" as the encoding (which will prompt the same message next time I edit and save the file). It has become an issue as in any such corrupted file any accentuated character is replaced by a sequence which doubles in size at each save, ending up doubling the size of the file. I am using GNU Emacs 24.2.1
These default coding systems were tried to encode text
in the buffer `test_accents.org':
(utf-8-unix (30 . 4194182) (33 . 4194182) (34 . 4194182) (37
. 4194182) (40 . 4194181) (41 . 4194182) (42 . 4194182) (45
. 4194182) (48 . 4194182) (49 . 4194182) (52 . 4194182))
However, each of them encountered characters it couldn't encode:
utf-8-unix cannot encode these: ...
Click on a character (or switch to this window by `C-x o'
and select the characters by RET) to jump to the place it appears,
where `C-u C-x =' will give information about it.
Select one of the safe coding systems listed below,
or cancel the writing with C-g and edit the buffer
to remove or modify the problematic characters,
or specify any other coding system (and risk losing
the problematic characters).
raw-text emacs-mule no-conversion
I have struggled with this in emacs many times. When I have a file that was messed up, e.g. in raw-text-unix mode, and save as utf-8, emacs complains even about text that is already clean utf-8. I haven't found a way to get it to only complain about non-utf-8.
I just found a reasonable semi-automated approach using recode:
f=mixed-file
recode -f ..utf-8 $f > /tmp/recode.out
diff $f recode.out | cat -vt
# manually fix lines of text that can't be converted to utf-8 in $f,
# and re-run recode and diff until the output diff is empty.
One helpful tool along the way is http://www.ltg.ed.ac.uk/~richard/utf-8.cgi?input=342+200+224&mode=obytes
Then I just re-open the file in emacs, and it is recognized as clean unicode.
Here's something to maybe get you started:
(put 'eof-error 'error-conditions '(error eof-error))
(put 'eof-error 'error-message "End of stream")
(put 'bad-byte 'error-conditions '(error bad-byte))
(put 'bad-byte 'error-message "Not a UTF-8 byte")
(defclass stream ()
((bytes :initarg :bytes :accessor bytes-of)
(position :initform 0 :accessor position-of)))
(defun logbitp (byte bit) (not (zerop (logand byte (ash 1 bit)))))
(defmethod read-byte ((this stream) &optional eof-error eof)
(with-slots (bytes position) this
(if (< position (length bytes))
(prog1 (aref bytes position) (incf position))
(if eof-error (signal eof-error (list position)) eof))))
(defmethod unread-byte ((this stream))
(when (> (position-of this) 0) (decf (position-of this))))
(defun read-utf8-char (stream)
(let ((byte (read-byte stream 'eof-error)))
(if (not (logbitp byte 7)) byte
(let ((numbytes
(cond
((not (logbitp byte 5))
(setf byte (logand #2r11111 byte)) 1)
((not (logbitp byte 4))
(setf byte (logand #2r1111 byte)) 2)
((not (logbitp byte 3))
(setf byte (logand #2r111 byte)) 3))))
(dotimes (b numbytes byte)
(let ((next-byte (read-byte stream 'eof-error)))
(if (and (logbitp next-byte 7) (not (logbitp next-byte 6)))
(setf byte (logior (ash byte 6) (logand next-byte #2r111111)))
(signal 'bad-byte (list next-byte)))))
(signal 'bad-byte (list byte))))))
(defun load-corrupt-file (file)
(interactive "fFile to load: ")
(with-temp-buffer
(set-buffer-multibyte nil)
(insert-file-literally file)
(with-output-to-string
(set-buffer-multibyte t)
(loop with stream = (make-instance 'stream :bytes (buffer-string))
for next-char =
(condition-case err
(read-utf8-char stream)
(bad-byte (message "Fix this byte %d" (cdr err)))
(eof-error nil))
while next-char
do (write-char next-char)))))
What this code does - it loads a file with no conversion and tries to read it as if it was encoded using UTF-8, once it encounters a byte that doesn't seem like it belongs to UTF-8, it errors, and you need to handle it somehow, it's where "Fix this byte" message is). But you would need to be inventive about how you fixing it...