parser.scm
| 1 | ;;;; Copyright (C) 2020 Julien Lepiller <julien@lepiller.eu> |
| 2 | ;;;; |
| 3 | ;;;; This library is free software; you can redistribute it and/or |
| 4 | ;;;; modify it under the terms of the GNU Lesser General Public |
| 5 | ;;;; License as published by the Free Software Foundation; either |
| 6 | ;;;; version 3 of the License, or (at your option) any later version. |
| 7 | ;;;; |
| 8 | ;;;; This library is distributed in the hope that it will be useful, |
| 9 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 11 | ;;;; Lesser General Public License for more details. |
| 12 | ;;;; |
| 13 | ;;;; You should have received a copy of the GNU Lesser General Public |
| 14 | ;;;; License along with this library; if not, write to the Free Software |
| 15 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 16 | ;;;; |
| 17 | |
| 18 | (define-module (turtle parser) |
| 19 | #:use-module (ice-9 peg) |
| 20 | #:export (parse-turtle)) |
| 21 | |
| 22 | ;; Productions for terminals |
| 23 | ;; [18] IRIREF ::= '<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' /* #x00=NULL #01-#x1F=control codes #x20=space */ -- the negated class is spelled out positively below ("!", #x23-#x3B, "=", #x3F-#x5B, "]", "_", a-z, #x7E and above); the angle brackets are wrapped in (ignore ...) so they are matched but not captured |
| 24 | (define-peg-pattern iriref all |
| 25 | (and (ignore "<") |
| 26 | (* (or "!" (range #\x23 #\x3b) "=" (range #\x3f #\x5b) "]" "_" |
| 27 | (range #\x61 #\x7a) (range #\x7e #\x10ffff) uchar)) |
| 28 | (ignore ">"))) |
| 29 | ;; [139s] PNAME_NS ::= PN_PREFIX? ':' -- the prefix is optional; the ':' is wrapped in (ignore ...) so it is matched but not captured |
| 30 | (define-peg-pattern pname-ns all (and (? pn-prefix) (ignore ":"))) |
| 31 | ;; [140s] PNAME_LN ::= PNAME_NS PN_LOCAL -- a namespace part immediately followed by a local part, with no WS in between |
| 32 | (define-peg-pattern pname-ln all (and pname-ns pn-local)) |
| 33 | ;; [141s] BLANK_NODE_LABEL ::= '_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)? -- the optional tail is written as repeated groups of (zero or more '.' then one PN_CHARS), so a label cannot end in '.' |
| 34 | (define-peg-pattern blank-node-label all |
| 35 | (and "_:" (or pn-chars-u (range #\0 #\9)) (* (and (* ".") pn-chars)))) |
| 36 | ;; [144s] LANGTAG ::= '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* -- each '-' subtag is one or more alphanumerics, so "@en-US" is consumed whole; a dangling '-' is not part of the tag |
| 37 | (define-peg-pattern langtag all |
| 38 |   (and "@" (+ (or (range #\a #\z) (range #\A #\Z))) |
| 39 |        (* (and "-" (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9))))))) |
| 40 | ;; [19] INTEGER ::= [+-]? [0-9]+ -- an optional sign followed by one or more decimal digits |
| 41 | (define-peg-pattern integer all (and (? (or "-" "+")) (+ (range #\0 #\9)))) |
| 42 | ;; [20] DECIMAL ::= [+-]? [0-9]* '.' [0-9]+ -- optional sign, optional integer part, then a '.' and at least one fractional digit |
| 43 | (define-peg-pattern decimal all |
| 44 |   (and (? (or "-" "+")) (* (range #\0 #\9)) "." (+ (range #\0 #\9)))) |
| 45 | ;; [21] DOUBLE ::= [+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT) -- after the optional sign the three alternatives are tried in the order written; every one of them requires an exponent |
| 46 | (define-peg-pattern double all |
| 47 | (and (? (or "+" "-")) |
| 48 | (or (and (+ (range #\0 #\9)) "." (* (range #\0 #\9)) exponent) |
| 49 | (and "." (+ (range #\0 #\9)) exponent) |
| 50 | (and (+ (range #\0 #\9)) exponent)))) |
| 51 | ;; [154s] EXPONENT ::= [eE] [+-]? [0-9]+ -- exponent marker in either case, an optional sign, then one or more digits |
| 52 | (define-peg-pattern exponent body |
| 53 |   (and (or "E" "e") (? (or "-" "+")) (+ (range #\0 #\9)))) |
| 54 | ;; [22] STRING_LITERAL_QUOTE ::= '"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"' /* #x22=" #x5C=\ #xA=new line #xD=carriage return */ -- the ranges below leave out #x0A, #x0D, #x22 and #x5C, so a raw line feed or carriage return ends the match; the quotes are wrapped in (ignore ...) and not captured |
| 55 | (define-peg-pattern string-literal-quote all |
| 56 |   (and (ignore "\"") |
| 57 |        (* (or (range #\x00 #\x09) (range #\x0b #\x0c) (range #\x0e #\x21) |
| 58 |               (range #\x23 #\x5b) (range #\x5d #\x10ffff) echar uchar)) |
| 59 |        (ignore "\""))) |
| 60 | ;; [23] STRING_LITERAL_SINGLE_QUOTE ::= "'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'" /* #x27=' #x5C=\ #xA=new line #xD=carriage return */ -- the ranges below leave out #x0A, #x0D, #x27 and #x5C, so a raw line feed or carriage return ends the match; the quotes are wrapped in (ignore ...) and not captured |
| 61 | (define-peg-pattern string-literal-single-quote all |
| 62 |   (and (ignore "'") |
| 63 |        (* (or (range #\x00 #\x09) (range #\x0b #\x0c) (range #\x0e #\x26) |
| 64 |               (range #\x28 #\x5b) (range #\x5d #\x10ffff) echar uchar)) |
| 65 |        (ignore "'"))) |
| 66 | ;; [24] STRING_LITERAL_LONG_SINGLE_QUOTE ::= "'''" (("'" | "''")? ([^'\] | ECHAR | UCHAR))* "'''" -- each repetition takes at most two quote characters (the two-quote form is tried first) followed by one character that is neither a quote nor a backslash, or an escape; the ''' delimiters are wrapped in (ignore ...) and not captured |
| 67 | (define-peg-pattern string-literal-long-single-quote all |
| 68 | (and (ignore "'''") |
| 69 | (* (and (? (or "''" "'")) |
| 70 | (or (range #\x00 #\x26) (range #\x28 #\x5b) |
| 71 | (range #\x5d #\x10ffff) echar uchar))) |
| 72 | (ignore "'''"))) |
| 73 | ;; [25] STRING_LITERAL_LONG_QUOTE ::= '"""' (('"' | '""')? ([^"\] | ECHAR | UCHAR))* '"""' -- each repetition takes at most two quote characters (the two-quote form is tried first) followed by one character that is neither a quote nor a backslash, or an escape; the """ delimiters are wrapped in (ignore ...) and not captured |
| 74 | (define-peg-pattern string-literal-long-quote all |
| 75 | (and (ignore "\"\"\"") |
| 76 | (* (and (? (or "\"\"" "\"")) |
| 77 | (or (range #\x00 #\x21) (range #\x23 #\x5b) |
| 78 | (range #\x5d #\x10ffff) echar uchar))) |
| 79 | (ignore "\"\"\""))) |
| 80 | ;; [26] UCHAR ::= '\u' HEX HEX HEX HEX | '\U' HEX HEX HEX HEX HEX HEX HEX HEX -- the escape is matched as written (backslash, u or U, then 4 or 8 hex digits); this pattern does not decode it |
| 81 | (define-peg-pattern uchar body |
| 82 | (or (and "\\u" hex hex hex hex) |
| 83 | (and "\\U" hex hex hex hex hex hex hex hex))) |
| 84 | ;; [159s] ECHAR ::= '\' [tbnrf"'\] -- a backslash followed by exactly one of the eight listed characters; the two-character text is matched as written |
| 85 | (define-peg-pattern echar body |
| 86 |   (or "\\\\" "\\\"" "\\'" "\\b" "\\f" "\\n" "\\r" "\\t")) |
| 87 | ;; [161s] WS ::= #x20 | #x9 | #xD | #xA /* #x20=space #x9=character tabulation #xD=carriage return #xA=new line */ -- exactly one whitespace character |
| 88 | (define-peg-pattern ws body (or "\n" "\r" "\t" " ")) |
| 89 | ;; [162s] ANON ::= '[' WS* ']' -- only the single-character ws pattern is allowed between the brackets (no comments); the brackets are not wrapped in (ignore ...) |
| 90 | (define-peg-pattern anon all (and "[" (* ws) "]")) |
| 91 | ;; [163s] PN_CHARS_BASE ::= [A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] -- one character; the ranges below follow the production one for one, in the same order |
| 92 | (define-peg-pattern pn-chars-base body |
| 93 | (or (range #\A #\Z) (range #\a #\z) (range #\x00c0 #\x00d6) |
| 94 | (range #\x00d8 #\x00f6) (range #\x00f8 #\x02ff) (range #\x0370 #\x037d) |
| 95 | (range #\x037f #\x1fff) (range #\x200c #\x200d) (range #\x2070 #\x218f) |
| 96 | (range #\x2c00 #\x2fef) (range #\x3001 #\xd7ff) (range #\xf900 #\xfdcf) |
| 97 | (range #\xfdf0 #\xfffd) (range #\x10000 #\xeffff))) |
| 98 | ;; [164s] PN_CHARS_U ::= PN_CHARS_BASE | '_' -- one character: PN_CHARS_BASE extended with the underscore |
| 99 | (define-peg-pattern pn-chars-u body (or pn-chars-base "_")) |
| 100 | ;; [166s] PN_CHARS ::= PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040] -- #x00B7 (MIDDLE DOT) is written as a one-character range so the pattern does not depend on how this source file is encoded |
| 101 | (define-peg-pattern pn-chars body |
| 102 |   (or pn-chars-u "-" (range #\0 #\9) (range #\x00b7 #\x00b7) (range #\x0300 #\x036f) |
| 103 |       (range #\x203f #\x2040))) |
| 104 | ;; [167s] PN_PREFIX ::= PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)? -- the optional tail is written as repeated groups of (zero or more '.' then one PN_CHARS), so a prefix cannot end in '.' |
| 105 | (define-peg-pattern pn-prefix body |
| 106 | (and pn-chars-base (* (and (* ".") pn-chars)))) |
| 107 | ;; [168s] PN_LOCAL ::= (PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? -- '.' may appear inside the local part, but every run of dots must be followed by PN_CHARS, ':' or PLX, so a local name cannot end in '.' |
| 108 | (define-peg-pattern pn-local body |
| 109 | (and (or pn-chars-u ":" (range #\0 #\9) plx) |
| 110 | (* (and (* ".") (or pn-chars ":" plx))))) |
| 111 | ;; [169s] PLX ::= PERCENT | PN_LOCAL_ESC -- a percent-encoded byte or a backslash-escaped punctuation character; percent is tried first |
| 112 | (define-peg-pattern plx body (or percent pn-local-esc)) |
| 113 | ;; [170s] PERCENT ::= '%' HEX HEX -- '%' followed by exactly two hex digits; the text is matched as written, not decoded |
| 114 | (define-peg-pattern percent body (and "%" hex hex)) |
| 115 | ;; [171s] HEX ::= [0-9] | [A-F] | [a-f] -- exactly one hexadecimal digit, in either case |
| 116 | (define-peg-pattern hex body (or (range #\0 #\9) (range #\A #\F) (range #\a #\f))) |
| 117 | ;; [172s] PN_LOCAL_ESC ::= '\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%') -- a backslash followed by one of the twenty listed characters, given here in the production's order |
| 118 | (define-peg-pattern pn-local-esc body |
| 119 |   (and "\\" (or "_" "~" "." "-" "!" "$" "&" "'" "(" ")" "*" "+" "," ";" "=" |
| 120 |                 "/" "?" "#" "@" "%"))) |
| 121 | ;; Helpers outside the numbered productions: comment is '#' followed by any characters other than LF (#x0A) and CR (#x0D); WS (capture type none, so nothing is kept) skips any run of comments and whitespace; unrecognized matches any single character. |
| 122 | (define-peg-pattern comment body (and "#" (* (or (range #\x00 #\x09) |
| 123 | (range #\x0B #\x0C) |
| 124 | (range #\x0E #\x10FFFF))))) |
| 125 | (define-peg-pattern WS none (* (or comment ws))) |
| 126 | (define-peg-pattern unrecognized body (range #\x00 #\x10ffff)) |
| 127 | |
| 128 | |
| 129 | ;; [1] turtleDoc ::= statement* -- WS (comments and whitespace) is skipped before the first statement and after every statement |
| 130 | (define-peg-pattern turtle-doc body (and WS (* (and statement WS)))) |
| 131 | ;; [2] statement ::= directive | triples '.' -- the last alternative, (* unrecognized), is not part of the grammar: when neither a directive nor triples match, it takes the remaining input character by character; the terminating '.' is wrapped in (ignore ...) |
| 132 | (define-peg-pattern statement body |
| 133 | (or directive (and triples WS (ignore ".")) (* unrecognized))) |
| 134 | ;; [3] directive ::= prefixID | base | sparqlPrefix | sparqlBase -- alternatives are tried in the order written: the '@' forms first, then the SPARQL-style forms |
| 135 | (define-peg-pattern directive body (or prefix-id base sparql-prefix sparql-base)) |
| 136 | ;; [4] prefixID ::= '@prefix' PNAME_NS IRIREF '.' -- the keyword and the closing '.' are wrapped in (ignore ...); WS is skipped between the parts |
| 137 | (define-peg-pattern prefix-id all |
| 138 | (and (ignore "@prefix") WS pname-ns WS iriref WS (ignore "."))) |
| 139 | ;; [5] base ::= '@base' IRIREF '.' -- the keyword and the closing '.' are wrapped in (ignore ...); WS is skipped between the parts |
| 140 | (define-peg-pattern base all |
| 141 | (and (ignore "@base") WS iriref WS (ignore "."))) |
| 142 | ;; [5s] sparqlBase ::= "BASE" IRIREF -- the keyword is matched letter by letter in either case and is not captured; no terminating '.' is expected |
| 143 | (define-peg-pattern sparql-base all |
| 144 | (and (ignore (and (or "b" "B") (or "a" "A") (or "s" "S") (or "e" "E"))) |
| 145 | WS iriref)) |
| 146 | ;; [6s] sparqlPrefix ::= "PREFIX" PNAME_NS IRIREF -- the keyword is matched letter by letter in either case and is not captured; no terminating '.' is expected |
| 147 | (define-peg-pattern sparql-prefix all |
| 148 | (and (ignore (and (or "p" "P") (or "r" "R") (or "e" "E") (or "f" "F") |
| 149 | (or "i" "I") (or "x" "X"))) |
| 150 | WS pname-ns WS iriref)) |
| 151 | ;; [6] triples ::= subject predicateObjectList | blankNodePropertyList predicateObjectList? -- the subject form is tried first; WS is skipped between the two parts of either form |
| 152 | (define-peg-pattern triples all |
| 153 | (or (and subject WS predicate-object-list) |
| 154 | (and blank-node-property-list WS (? predicate-object-list)))) |
| 155 | ;; [7] predicateObjectList ::= verb objectList (';' (verb objectList)?)* -- ';' separators are wrapped in (ignore ...); a ';' may be followed by nothing, which allows trailing or repeated semicolons |
| 156 | (define-peg-pattern predicate-object-list all |
| 157 | (and verb WS object-list |
| 158 | (* (and WS (ignore ";") WS (? (and verb WS object-list)))))) |
| 159 | ;; [8] objectList ::= object (',' object)* -- ',' separators are wrapped in (ignore ...), with WS skipped on both sides |
| 160 | (define-peg-pattern object-list all |
| 161 | (and object (* (and WS (ignore ",") WS object)))) |
| 162 | ;; [9] verb ::= predicate | 'a' -- predicate is tried before the literal "a" |
| 163 | (define-peg-pattern verb all (or predicate "a")) |
| 164 | ;; [10] subject ::= iri | BlankNode | collection -- alternatives are tried in the order written |
| 165 | (define-peg-pattern subject all (or iri blank-node collection)) |
| 166 | ;; [11] predicate ::= iri -- a thin wrapper that adds a predicate node around the iri match |
| 167 | (define-peg-pattern predicate all iri) |
| 168 | ;; [12] object ::= iri | BlankNode | collection | blankNodePropertyList | literal -- alternatives are tried in the order written; blank-node (which includes ANON, '[' WS* ']') comes before blank-node-property-list |
| 169 | (define-peg-pattern object all |
| 170 | (or iri blank-node collection blank-node-property-list literal)) |
| 171 | ;; [13] literal ::= RDFLiteral | NumericLiteral | BooleanLiteral -- declared body, so no literal node is added around the chosen alternative |
| 172 | (define-peg-pattern literal body |
| 173 | (or rdf-literal numeric-literal boolean-literal)) |
| 174 | ;; [14] blankNodePropertyList ::= '[' predicateObjectList ']' -- the brackets are wrapped in (ignore ...); WS is skipped just inside them |
| 175 | (define-peg-pattern blank-node-property-list all |
| 176 | (and (ignore "[") WS predicate-object-list WS (ignore "]"))) |
| 177 | ;; [15] collection ::= '(' object* ')' -- the parentheses are wrapped in (ignore ...); WS is skipped after '(' and after every object |
| 178 | (define-peg-pattern collection all |
| 179 | (and (ignore "(") WS (* (and object WS)) (ignore ")"))) |
| 180 | ;; [16] NumericLiteral ::= INTEGER | DECIMAL | DOUBLE -- PEG choice is ordered, so the alternatives are tried from the longest form to the shortest; with INTEGER first it would win on the "1" of "1.5" or "1e3" and leave the rest unparsed |
| 181 | (define-peg-pattern numeric-literal all (or double decimal integer)) |
| 182 | ;; [128s] RDFLiteral ::= String (LANGTAG | '^^' iri)? -- unlike the production, WS is also accepted after the string and after '^^' |
| 183 | (define-peg-pattern rdf-literal all |
| 184 | (and string-pat WS (? (or langtag (and "^^" WS iri))))) |
| 185 | ;; [133s] BooleanLiteral ::= 'true' | 'false' -- matches one of the two lower-case keywords exactly |
| 186 | (define-peg-pattern boolean-literal all (or "false" "true")) |
| 187 | ;; [17] String ::= STRING_LITERAL_QUOTE | STRING_LITERAL_SINGLE_QUOTE | STRING_LITERAL_LONG_SINGLE_QUOTE | STRING_LITERAL_LONG_QUOTE -- the long (triple-quoted) forms are listed first; with the short forms first, the opening of a triple quote would match as an empty short string |
| 188 | (define-peg-pattern string-pat all |
| 189 | (or string-literal-long-single-quote string-literal-long-quote |
| 190 | string-literal-quote string-literal-single-quote)) |
| 191 | ;; [135s] iri ::= IRIREF | PrefixedName -- an IRI in angle brackets is tried before a prefixed name |
| 192 | (define-peg-pattern iri all (or iriref prefixed-name)) |
| 193 | ;; [136s] PrefixedName ::= PNAME_LN | PNAME_NS -- PNAME_LN is tried first; PNAME_NS alone matches a bare namespace such as "prefix:" |
| 194 | (define-peg-pattern prefixed-name all (or pname-ln pname-ns)) |
| 195 | ;; [137s] BlankNode ::= BLANK_NODE_LABEL | ANON -- a labelled blank node ("_:" label) is tried before the anonymous form |
| 196 | (define-peg-pattern blank-node all (or blank-node-label anon)) |
| 197 | |
| 198 | |
| 199 | ;; Match STR against the turtle-doc pattern and return the parse tree of that match. |
| 200 | (define (parse-turtle str) |
| 201 |   (let ((result (match-pattern turtle-doc str))) (peg:tree result))) |
| 202 |