(* Still to do: + Better error messages for ill-formed s-expressions *) module type SEXP = sig type sexp = Int of int | Flt of float | Str of string | Chr of char | Sym of string | Seq of sexp list exception IllFormedSexp of string val stringToSexp : string -> sexp val stringToSexps : string -> sexp list val fileToSexp : string -> sexp val fileToSexps : string -> sexp list val sexpToString : sexp -> string val sexpToString' : int -> sexp -> string val sexpsToString : sexp list -> string val sexpToFile : sexp -> string -> unit val readSexp : unit -> sexp end module Sexp : SEXP = struct type sexp = Int of int | Flt of float | Str of string | Chr of char | Sym of string | Seq of sexp list (************************************************************ Unparsing -- i.e, turning an s-expression into a string. It is easy to produce a single-line string, but in practice is nicer to "pretty-print" the s-expression on several lines when it doesn't fit the terminal width (typically 80 characters). Below is a simple backtracking pretty-printing algorithm; there are much fancier ones that do a better job and/or do not backtrack. ************************************************************) let rec sexpsToString sexps = String.concat "\n\n" (List.map sexpToString sexps) and sexpToString sexp = sexpToString' 80 sexp and sexpToString' width sexp = String.concat "\n" (sexpToStrings width sexp) (* Unparse sexp as a list of pretty-printed lines *) (* When possible, try to have all lines be <= width chars wide *) and sexpToStrings width sexp = match sexp with (* For leaf tokens (ints, floats, symbols, chars, strings) we "lose" if they're bigger than width. No way to address this. *) Int i -> [string_of_int i] | Flt f -> [string_of_float f] | Sym s -> [s] | Chr c -> (* Don't introduce escape sequences *) ["\'" ^ (String.make 1 c) ^ "\'"] | Str s -> (* Don't introduce escape sequences *) ["\"" ^ s ^ "\""] | Seq [] -> ["()"] | Seq (sexp1::sexps) -> match sexpToStrings (width - 1) (* account for "(" *) sexp1 with [s1] -> (* First sexp fits on single line. Try to get format (s1 s2 s3 ... sn) or (s1 s2 s3 ... sn ) *) squeeze width s1 sexps | strs -> (* Resort to shape (s1 s2 s3 ... sn ) *) seqToStrings (strs @ (List.concat (List.map (sexpToStrings (width - 1)) sexps))) and squeeze width s1 sexps = let len1 = String.length s1 in let rest1 = List.concat (List.map (sexpToStrings (width - len1 - 2)) sexps) in if (len1 + (totalLen rest1) + 3) <= width then (* everything fits on one line. The "3" accounts for initial '(', the ' ' between s1 and rest1 (if rest1 is non-empty) and the final ')' *) ["(" ^ (String.concat " " (s1::rest1)) ^ ")"] else if (len1 + (maxLen rest1) + 2) <= width then (* everything fits into shape (s1 s2 s3 ... sn ) The "2" accounts for the initial '(' and the ' ' between s1 and rest1 *) (match rest1 with [] -> ["(" ^ s1 ^ ")"] | (str1 :: strs) -> ("(" ^ s1 ^ " " ^ str1) :: (List.map (prefix (len1 + 2)) (* 1 for '(', 1 for ' ' *) (strs @ [")"])) ) else (* must resort to shape (s1 s2 s3 ... sn ) This requires backtracking on assumed width *) seqToStrings (s1 :: (List.concat (List.map (sexpToStrings (width - 1)) sexps))) and seqToStrings strings = match strings with [] -> ["()"] | (fst :: rst) -> ("(" ^ fst) :: ((List.map (fun s -> " " ^ s) rst) @ [" )"]) and prefix n str = (* prefix str with n spaces *) (String.make n ' ') ^ str and totalLen strs = (* Total length of strings in list of strings *) (* Count 1 for spaces between strings *) (List.length strs) - 1 + (List.fold_left (fun n str -> n + (String.length str)) 0 strs) and maxLen strs = (* Length of longest string in list of strings *) List.fold_left (fun n str -> max n (String.length str)) 0 strs (************************************************************ Scanning strings into s-expression tokens ************************************************************) type token = | ATOM of string | STRING of string | CHAR of char | LPAREN | RPAREN let tokenToString tok = match tok with ATOM s -> "{ATOM " ^ s ^ "}" | STRING s -> "{STRING \"" ^ s ^ "\"}" | CHAR c -> "{CHAR \'" ^ (String.make 1 c) ^ "\'}" | LPAREN -> "{LPAREN}" | RPAREN -> "{RPAREN}" let tokensToString toks = match toks with [] -> "[]" | [t] -> "[" ^ (tokenToString t) ^ "]" | (t::ts) -> "[" ^ (tokenToString t) ^ (List.fold_right (fun t s -> (", " ^ (tokenToString t) ^ s)) ts "]") exception IllFormedSexp of string (* Note: stringToTokens is a very compelling example of block structure -- e.g. definining local functions inside another function definition. *) let stringToTokens s = let len = String.length s in let rec scanTokens i = if i >= len then [] else match String.get s i with ' ' | '\t' | '\n' | '\r' | '\b' -> scanTokens(i+1) (* ignore whitespace *) | '(' -> (LPAREN :: scanTokens(i+1)) | ')' -> (RPAREN :: scanTokens(i+1)) | '"' -> scanString (i+1) (i+1) [] (* start of string *) | '\'' -> scanChar (i+1) (* start of char *) | '{' -> scanBlockComment (i+1) 0 (* beginning of block comment *) | ';' -> scanLineComment (i+1) (* beginning of line comment *) | _ -> scanSymbol i (i+1) (* get a symbol *) and scanString start k revChars = (* Look for end of strings quotes. Handle escapes along the way. revChars is reversed list of chars seen so far *) if k >= len then raise (IllFormedSexp ("Sexp: input ended before end of string:\n" ^ (StringUtils.implode(List.rev revChars)))) else let c = String.get s k in if c = '\"' then (* close double-quote ending string *) STRING(StringUtils.implode(List.rev revChars))::scanTokens(k+1) else if c = '\\' then (* begin escape sequence *) if (k+1) >= len then raise (IllFormedSexp ("Sexp: input ended before end of string:\n" ^ (StringUtils.implode(List.rev (c::revChars))))) else scanString start (k+2) (escaped(String.get s (k+1))::revChars) else (* continue reading string *) scanString start (k+1) (c::revChars) and escaped c = (* convert escaped char to special char *) match c with 't' -> '\t' | 'n' -> '\n' | 'r' -> '\r' | 'b' -> '\b' | '\\' -> '\\' | '\"' -> '\"' | '\'' -> '\'' | _ -> raise (IllFormedSexp ("Sexp: unrecognized escape sequence:" ^ "\\" ^ (String.make 1 c))) and scanChar k = if k >= len then raise (IllFormedSexp "Sexp: input ended before end of char literal") else let c = String.get s k in if (c = '\\') then (* begin escape sequence *) if k+2 >= len then raise (IllFormedSexp "Sexp: input ended before end of char literal") else if (String.get s (k+2)) = '\'' then CHAR(escaped(String.get s (k+1)))::scanTokens(k+3) else raise (IllFormedSexp ("Sexp: ill-formed char literal: " ^ "'\\" ^ (String.sub s (k+1) 2))) else if k+1 >= len then raise (IllFormedSexp "Sexp: input ended before end of char literal") else if (String.get s (k+1)) = '\'' then CHAR(c)::scanTokens(k+2) else raise (IllFormedSexp ("Sexp: ill-formed char literal: " ^ "'" ^ (String.sub s k 2))) and scanLineComment k = if k >= len then [] else let c = String.get s k in if c = '\n' then scanTokens (k+1) else scanLineComment (k+1) and scanBlockComment k nestLevel = (* Ignore characters in comments. Handle nesting levels appropriately to avoid summary execution by lyn. *) if k >= len then raise (IllFormedSexp "Sexp: input ended before end of block comment") else let c = String.get s k in if c = '}' then if nestLevel = 0 then scanTokens (k+1) else scanBlockComment (k+1) (nestLevel - 1) else if c = '{' then scanBlockComment (k+1) (nestLevel + 1) else scanBlockComment (k+1) nestLevel and scanSymbol start k = if k >= len then [ATOM(String.sub s start (k-start))] else let c = String.get s k in if List.memq c [' ';'\t';'\n';'\r';'\b';'(';')';'{'] then (ATOM(String.sub s start (k-start))::scanTokens(k)) else scanSymbol start (k+1) in scanTokens 0 (************************************************************ Parsing s-expression tokens into s-expresions ************************************************************) (* fromToks : token list -> (sexp * token list) *) let rec fromToks toks = match toks with [] -> raise (IllFormedSexp "Sexp: no tokens") | (STRING(s)::ts) -> (Str(s), ts) | (CHAR(c)::ts) -> (Chr(c), ts) | (ATOM(s)::ts) -> (try (Int(int_of_string(s)), ts) with (Failure("int_of_string")) -> (try (Flt(float_of_string(s)), ts) with (Failure("float_of_string")) -> (Sym(s), ts))) | (LPAREN::ts) -> (match fromToksList ts with (sexps,ts') -> (Seq(sexps), ts')) | (RPAREN::_) -> raise (IllFormedSexp "Sexp: unmatched right paren") (* fromToksList : token list -> (sexp list * token list) *) (* Collects all sexps before next right paren *) and fromToksList toks = match toks with [] -> raise (IllFormedSexp "Sexp: no tokens") | (RPAREN::ts) -> ([],ts) | _ -> (match fromToks toks with (sexp,ts) -> (match fromToksList ts with (sexps,ts') -> (sexp::sexps,ts'))) let stringToSexp s = match fromToks (stringToTokens s) with (sexp, []) -> sexp | (sexp, toks) -> raise (IllFormedSexp ("Sexp: extra tokens\n" ^ (tokensToString toks) ^ "\n\nafter end of sexp:\n\n" ^ (sexpToString sexp))) let fileToSexp filename = stringToSexp(File.fileToString(filename)) let rec stringToSexps s = toksToSexps (stringToTokens s) and toksToSexps toks = match toks with [] -> [] | _ -> (match fromToks toks with (sexp, toks') -> sexp :: (toksToSexps toks')) let fileToSexps filename = stringToSexps(File.fileToString(filename)) let sexpToFile sexp filename = File.stringToFile (sexpToString sexp) filename (* Returns true if str is a single sexp and false otherwise *) let isSexp str = try (let _ = stringToSexp str in true) with IllFormedSexp _ -> false (* Read lines from standard input until have a complete s-expression *) let readSexp () = let rec loop str = try stringToSexp str with IllFormedSexp _ -> loop (str ^ (read_line())) in loop "" end