;;;; guile-rdf/turtle/parser.scm

1
;;;; Copyright (C) 2020 Julien Lepiller <julien@lepiller.eu>
2
;;;; 
3
;;;; This library is free software; you can redistribute it and/or
4
;;;; modify it under the terms of the GNU Lesser General Public
5
;;;; License as published by the Free Software Foundation; either
6
;;;; version 3 of the License, or (at your option) any later version.
7
;;;; 
8
;;;; This library is distributed in the hope that it will be useful,
9
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
10
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11
;;;; Lesser General Public License for more details.
12
;;;; 
13
;;;; You should have received a copy of the GNU Lesser General Public
14
;;;; License along with this library; if not, write to the Free Software
15
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16
;;;; 
17
18
;; Module (turtle parser): a PEG grammar for Turtle written with (ice-9 peg).
;; `parse-turtle' is the only exported binding.
(define-module (turtle parser)
  #:export (parse-turtle)
  #:use-module (ice-9 peg))
21
22
;; Productions for terminals
23
;; [18] 	IRIREF 	::= 	'<' ([^#x00-#x20<>"{}|^`\] | UCHAR)* '>' /* #x00=NULL #01-#x1F=control codes #x20=space */
24
;; IRI reference.  The enclosing "<" and ">" are matched but ignored.
;; The permitted characters are spelled out as what is left of
;; #x21-#x10FFFF once " < > \ ^ ` { | } are removed; a backslash can
;; therefore only appear as the start of a `uchar' escape.
(define-peg-pattern iriref all
  (and (ignore "<")
       (* (or "!"                       ; #x21
              (range #\x23 #\x3b)       ; #x23-#x3B, skips "
              "="                       ; #x3D, skips < and >
              (range #\x3f #\x5b)       ; #x3F-#x5B, skips \
              "]"                       ; #x5D, skips ^
              "_"                       ; #x5F, skips `
              (range #\x61 #\x7a)       ; a-z, skips { | }
              (range #\x7e #\x10ffff)   ; ~ and everything above
              uchar))
       (ignore ">")))
29
;; [139s] 	PNAME_NS 	::= 	PN_PREFIX? ':'
30
;; Namespace part of a prefixed name: an optional `pn-prefix' followed by
;; ":".  The colon is matched but ignored.
(define-peg-pattern pname-ns all
  (and (? pn-prefix)
       (ignore ":")))
31
;; [140s] 	PNAME_LN 	::= 	PNAME_NS PN_LOCAL
32
;; Full prefixed name: a `pname-ns' immediately followed by a `pn-local'.
(define-peg-pattern pname-ln all
  (and pname-ns
       pn-local))
33
;; [141s] 	BLANK_NODE_LABEL 	::= 	'_:' (PN_CHARS_U | [0-9]) ((PN_CHARS | '.')* PN_CHARS)?
34
;; Labelled blank node: "_:" then a first character (`pn-chars-u' or a
;; digit) and a tail.  Each tail step is "any number of dots, then one
;; `pn-chars'", so a label can contain dots but never ends with one.
(define-peg-pattern blank-node-label all
  (and "_:"
       (or pn-chars-u
           (range #\0 #\9))
       (* (and (* ".")
               pn-chars))))
36
;; [144s] 	LANGTAG 	::= 	'@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
37
;; Language tag: "@", one or more ASCII letters, then any number of
;; "-"-separated subtags.  Every subtag is one OR MORE letters/digits, hence
;; the `+' around the character class: with a single-character match a tag
;; such as "@en-US" would be cut off after "@en-U".
(define-peg-pattern langtag all
  (and "@"
       (+ (or (range #\a #\z) (range #\A #\Z)))
       (* (and "-"
               (+ (or (range #\a #\z) (range #\A #\Z) (range #\0 #\9)))))))
40
;; [19] 	INTEGER 	::= 	[+-]? [0-9]+
41
;; Integer literal: an optional sign followed by at least one decimal digit.
(define-peg-pattern integer all
  (and (? (or "-" "+"))
       (+ (range #\0 #\9))))
42
;; [20] 	DECIMAL 	::= 	[+-]? [0-9]* '.' [0-9]+
43
;; Decimal literal: optional sign, optional integer digits, a mandatory "."
;; and at least one fractional digit.
(define-peg-pattern decimal all
  (and (? (or "-" "+"))
       (* (range #\0 #\9))
       "."
       (+ (range #\0 #\9))))
45
;; [21] 	DOUBLE 	::= 	[+-]? ([0-9]+ '.' [0-9]* EXPONENT | '.' [0-9]+ EXPONENT | [0-9]+ EXPONENT)
46
;; Double literal: optional sign, then one of three mantissa shapes, each of
;; which must end with an `exponent':
;;   digits "." optional-digits   |   "." digits   |   digits
(define-peg-pattern double all
  (and (? (or "-" "+"))
       (or
        ;; 1.  /  1.5  followed by the exponent
        (and (+ (range #\0 #\9))
             "."
             (* (range #\0 #\9))
             exponent)
        ;; .5  followed by the exponent
        (and "."
             (+ (range #\0 #\9))
             exponent)
        ;; 15  followed by the exponent
        (and (+ (range #\0 #\9))
             exponent))))
51
;; [154s] 	EXPONENT 	::= 	[eE] [+-]? [0-9]+
52
;; Exponent suffix of a double: "e" or "E", an optional sign, and at least
;; one digit.
(define-peg-pattern exponent body
  (and (or "E" "e")
       (? (or "-" "+"))
       (+ (range #\0 #\9))))
54
;; [22] 	STRING_LITERAL_QUOTE 	::= 	'"' ([^#x22#x5C#xA#xD] | ECHAR | UCHAR)* '"' /* #x22=" #x5C=\ #xA=new line #xD=carriage return */
55
;; Short double-quoted string.  The delimiting quotes are matched but
;; ignored.  Raw characters are everything except #x22 ("), #x5C (\),
;; #x0A (LF) and #x0D (CR); the ranges below are the gaps between those four
;; code points.  The third range starts at #x0E, not #x0D: starting at #x0D
;; would let a raw carriage return into a single-line string.
(define-peg-pattern string-literal-quote all
  (and (ignore "\"")
       (* (or (range #\x00 #\x09)       ; up to TAB   (#x0A LF excluded)
              (range #\x0b #\x0c)       ; VT, FF      (#x0D CR excluded)
              (range #\x0e #\x21)       ;             (#x22 "  excluded)
              (range #\x23 #\x5b)       ;             (#x5C \  excluded)
              (range #\x5d #\x10ffff)
              echar
              uchar))
       (ignore "\"")))
60
;; [23] 	STRING_LITERAL_SINGLE_QUOTE 	::= 	"'" ([^#x27#x5C#xA#xD] | ECHAR | UCHAR)* "'" /* #x27=' #x5C=\ #xA=new line #xD=carriage return */
61
;; Short single-quoted string.  The delimiting quotes are matched but
;; ignored.  Raw characters are everything except #x27 ('), #x5C (\),
;; #x0A (LF) and #x0D (CR); the ranges below are the gaps between those four
;; code points.  The third range starts at #x0E, not #x0D: starting at #x0D
;; would let a raw carriage return into a single-line string.
(define-peg-pattern string-literal-single-quote all
  (and (ignore "'")
       (* (or (range #\x00 #\x09)       ; up to TAB   (#x0A LF excluded)
              (range #\x0b #\x0c)       ; VT, FF      (#x0D CR excluded)
              (range #\x0e #\x26)       ;             (#x27 '  excluded)
              (range #\x28 #\x5b)       ;             (#x5C \  excluded)
              (range #\x5d #\x10ffff)
              echar
              uchar))
       (ignore "'")))
66
;; [24] 	STRING_LITERAL_LONG_SINGLE_QUOTE 	::= 	"'''" (("'" | "''")? ([^'\] | ECHAR | UCHAR))* "'''"
67
;; Long string delimited by ''' (delimiters matched but ignored).  Each
;; repetition is "at most two quotes, then one character that is neither
;; ' nor \, or an escape", so a run of three quotes can only be the closing
;; delimiter.  The two-quote alternative is tried before the one-quote one.
(define-peg-pattern string-literal-long-single-quote all
  (and (ignore "'''")
       (* (and (? (or "''" "'"))
               (or (range #\x00 #\x26)       ; below '
                   (range #\x28 #\x5b)       ; between ' and \
                   (range #\x5d #\x10ffff)   ; above \
                   echar
                   uchar)))
       (ignore "'''")))
73
;; [25] 	STRING_LITERAL_LONG_QUOTE 	::= 	'"""' (('"' | '""')? ([^"\] | ECHAR | UCHAR))* '"""'
74
;; Long string delimited by """ (delimiters matched but ignored).  Each
;; repetition is "at most two quotes, then one character that is neither
;; " nor \, or an escape", so a run of three quotes can only be the closing
;; delimiter.  The two-quote alternative is tried before the one-quote one.
(define-peg-pattern string-literal-long-quote all
  (and (ignore "\"\"\"")
       (* (and (? (or "\"\"" "\""))
               (or (range #\x00 #\x21)       ; below "
                   (range #\x23 #\x5b)       ; between " and \
                   (range #\x5d #\x10ffff)   ; above \
                   echar
                   uchar)))
       (ignore "\"\"\"")))
80
;; [26] 	UCHAR 	::= 	'\u' HEX HEX HEX HEX | '\U' HEX HEX HEX HEX HEX HEX HEX HEX
81
;; Numeric escape: backslash-u with exactly four `hex' digits, or
;; backslash-U with exactly eight.
(define-peg-pattern uchar body
  (or (and "\\u"
           hex hex hex hex)
      (and "\\U"
           hex hex hex hex
           hex hex hex hex)))
84
;; [159s] 	ECHAR 	::= 	'\' [tbnrf"'\]
85
;; Character escape: a backslash followed by one of  t b n r f " ' \ .
;; Each alternative is the complete two-character sequence.
(define-peg-pattern echar body
  (or "\\t" "\\b" "\\n" "\\r" "\\f"   ; control-character escapes
      "\\\""                           ; escaped double quote
      "\\'"                            ; escaped single quote
      "\\\\"))                         ; escaped backslash
87
;; [161s] 	WS 	::= 	#x20 | #x9 | #xD | #xA /* #x20=space #x9=character tabulation #xD=carriage return #xA=new line */
88
;; A single whitespace character: space, tab, carriage return or newline.
(define-peg-pattern ws body
  (or " "
      "\t"
      "\r"
      "\n"))
89
;; [162s] 	ANON 	::= 	'[' WS* ']'
90
;; Anonymous blank node: "[" and "]" with nothing but `ws' characters
;; between them.
(define-peg-pattern anon all
  (and "["
       (* ws)
       "]"))
91
;; [163s] 	PN_CHARS_BASE 	::= 	[A-Z] | [a-z] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x02FF] | [#x0370-#x037D] | [#x037F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
92
;; Base character class for prefixed names: ASCII letters plus the listed
;; non-ASCII ranges, one range per line in ascending code-point order.
(define-peg-pattern pn-chars-base body
  (or (range #\A #\Z)
      (range #\a #\z)
      (range #\x00c0 #\x00d6)
      (range #\x00d8 #\x00f6)
      (range #\x00f8 #\x02ff)
      (range #\x0370 #\x037d)
      (range #\x037f #\x1fff)
      (range #\x200c #\x200d)
      (range #\x2070 #\x218f)
      (range #\x2c00 #\x2fef)
      (range #\x3001 #\xd7ff)
      (range #\xf900 #\xfdcf)
      (range #\xfdf0 #\xfffd)
      (range #\x10000 #\xeffff)))
98
;; [164s] 	PN_CHARS_U 	::= 	PN_CHARS_BASE | '_'
99
;; `pn-chars-base' extended with the underscore.
(define-peg-pattern pn-chars-u body
  (or pn-chars-base
      "_"))
100
;; [166s] 	PN_CHARS 	::= 	PN_CHARS_U | '-' | [0-9] | #x00B7 | [#x0300-#x036F] | [#x203F-#x2040]
101
;; Characters allowed inside a name: `pn-chars-u', "-", digits, U+00B7
;; (MIDDLE DOT), and the ranges U+0300-U+036F and U+203F-U+2040.
;; U+00B7 is written as a one-character range rather than a string literal
;; so that the pattern does not depend on the encoding of this source file;
;; the literal had been corrupted into a two-character Thai string, which
;; made the pattern match that string instead of the middle dot.
(define-peg-pattern pn-chars body
  (or pn-chars-u
      "-"
      (range #\0 #\9)
      (range #\x00b7 #\x00b7)           ; U+00B7 MIDDLE DOT
      (range #\x0300 #\x036f)
      (range #\x203f #\x2040)))
104
;; [167s] 	PN_PREFIX 	::= 	PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)?
105
;; Prefix name: one `pn-chars-base', then any number of steps of the form
;; "zero or more dots followed by one `pn-chars'".  Grouping the dots with
;; the following character means a prefix never ends with ".".
(define-peg-pattern pn-prefix body
  (and pn-chars-base
       (* (and (* ".")
               pn-chars))))
107
;; [168s] 	PN_LOCAL 	::= 	(PN_CHARS_U | ':' | [0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))?
108
;; Local part of a prefixed name.  First item: `pn-chars-u', ":", a digit
;; or a `plx'.  Tail: any number of steps of the form "zero or more dots
;; followed by a `pn-chars', ":" or `plx'", so the name never ends with ".".
(define-peg-pattern pn-local body
  (and (or pn-chars-u
           ":"
           (range #\0 #\9)
           plx)
       (* (and (* ".")
               (or pn-chars
                   ":"
                   plx)))))
111
;; [169s] 	PLX 	::= 	PERCENT | PN_LOCAL_ESC
112
;; Either a `percent' escape or a `pn-local-esc' backslash escape.
(define-peg-pattern plx body
  (or percent
      pn-local-esc))
113
;; [170s] 	PERCENT 	::= 	'%' HEX HEX
114
;; Percent escape: "%" followed by exactly two `hex' digits.
(define-peg-pattern percent body
  (and "%"
       hex
       hex))
115
;; [171s] 	HEX 	::= 	[0-9] | [A-F] | [a-f]
116
;; One hexadecimal digit, in either letter case.
(define-peg-pattern hex body
  (or (range #\0 #\9)
      (range #\A #\F)
      (range #\a #\f)))
117
;; [172s] 	PN_LOCAL_ESC 	::= 	'\' ('_' | '~' | '.' | '-' | '!' | '$' | '&' | "'" | '(' | ')' | '*' | '+' | ',' | ';' | '=' | '/' | '?' | '#' | '@' | '%')
118
;; Backslash escape allowed in a local name: a backslash followed by one of
;; the punctuation characters listed below.
(define-peg-pattern pn-local-esc body
  (and "\\"
       (or "_" "~" "." "-" "!"
           "$" "&" "'" "(" ")"
           "*" "+" "," ";" "="
           "/" "?" "#" "@" "%")))
121
122
;; Line comment: "#" and every following character up to, but not
;; including, the next line feed (#x0A) or carriage return (#x0D).
(define-peg-pattern comment body
  (and "#"
       (* (or (range #\x00 #\x09)         ; below LF
              (range #\x0B #\x0C)         ; between LF and CR
              (range #\x0E #\x10FFFF))))) ; above CR
125
;; Inter-token filler: any run (possibly empty) of comments and whitespace
;; characters.  Declared with capture type `none'.  Not to be confused with
;; the lower-case `ws', which is a single whitespace character.
(define-peg-pattern WS none
  (* (or comment
         ws)))
126
;; Any single character at all (#x00 through #x10FFFF).
(define-peg-pattern unrecognized body
  (range #\x00 #\x10ffff))
127
128
129
;; [1] 	turtleDoc 	::= 	statement*
130
;; Whole document: leading filler, then any number of statements, each
;; followed by filler.
(define-peg-pattern turtle-doc body
  (and WS
       (* (and statement
               WS))))
131
;; [2] 	statement 	::= 	directive | triples '.'
132
;; One statement.  Alternatives are tried in this order:
;;   1. a directive;
;;   2. triples followed by filler and a "." (the dot is matched but ignored);
;;   3. a run of `unrecognized' characters, i.e. whatever input is left when
;;      neither of the first two alternatives applies.
(define-peg-pattern statement body
  (or directive
      (and triples
           WS
           (ignore "."))
      (* unrecognized)))
134
;; [3] 	directive 	::= 	prefixID | base | sparqlPrefix | sparqlBase
135
;; Any of the four directive forms: the two "@" forms first, then the two
;; SPARQL-style forms.
(define-peg-pattern directive body
  (or prefix-id
      base
      sparql-prefix
      sparql-base))
136
;; [4] 	prefixID 	::= 	'@prefix' PNAME_NS IRIREF '.'
137
;; "@prefix" directive: keyword, namespace, IRI, terminating dot, with filler
;; allowed between each of them.  The keyword and the dot are matched but
;; ignored.
(define-peg-pattern prefix-id all
  (and (ignore "@prefix")
       WS
       pname-ns
       WS
       iriref
       WS
       (ignore ".")))
139
;; [5] 	base 	::= 	'@base' IRIREF '.'
140
;; "@base" directive: keyword, IRI, terminating dot, with filler allowed
;; between each of them.  The keyword and the dot are matched but ignored.
(define-peg-pattern base all
  (and (ignore "@base")
       WS
       iriref
       WS
       (ignore ".")))
142
;; [5s] 	sparqlBase 	::= 	"BASE" IRIREF
143
;; SPARQL-style BASE directive.  The keyword is matched letter by letter in
;; either case (so "BASE", "base", "Base", ... are all accepted) and then
;; ignored; an IRI follows after optional filler.  No terminating dot.
(define-peg-pattern sparql-base all
  (and (ignore (and (or "B" "b")
                    (or "A" "a")
                    (or "S" "s")
                    (or "E" "e")))
       WS
       iriref))
146
;; [6s] 	sparqlPrefix 	::= 	"PREFIX" PNAME_NS IRIREF
147
;; SPARQL-style PREFIX directive.  The keyword is matched letter by letter in
;; either case and then ignored; a namespace and an IRI follow, each after
;; optional filler.  No terminating dot.
(define-peg-pattern sparql-prefix all
  (and (ignore (and (or "P" "p")
                    (or "R" "r")
                    (or "E" "e")
                    (or "F" "f")
                    (or "I" "i")
                    (or "X" "x")))
       WS
       pname-ns
       WS
       iriref))
151
;; [6] 	triples 	::= 	subject predicateObjectList | blankNodePropertyList predicateObjectList?
152
;; A triples block is either
;;   - a subject followed by a predicate-object list, or
;;   - a blank-node property list optionally followed by a predicate-object
;;     list.
;; The subject form is tried first.
(define-peg-pattern triples all
  (or (and subject
           WS
           predicate-object-list)
      (and blank-node-property-list
           WS
           (? predicate-object-list))))
155
;; [7] 	predicateObjectList 	::= 	verb objectList (';' (verb objectList)?)*
156
;; One "verb object-list" pair, then any number of ";" continuations.  What
;; follows a ";" is optional, so repeated or trailing semicolons are
;; accepted.  The semicolons are matched but ignored.
(define-peg-pattern predicate-object-list all
  (and verb
       WS
       object-list
       (* (and WS
               (ignore ";")
               WS
               (? (and verb
                       WS
                       object-list))))))
159
;; [8] 	objectList 	::= 	object (',' object)*
160
;; One object, then any number of further objects each introduced by ","
;; (matched but ignored), with filler allowed around the comma.
(define-peg-pattern object-list all
  (and object
       (* (and WS
               (ignore ",")
               WS
               object))))
162
;; [9] 	verb 	::= 	predicate | 'a'
163
;; A verb is a predicate or, failing that, the bare keyword "a".  The
;; predicate alternative is tried first.
(define-peg-pattern verb all
  (or predicate
      "a"))
164
;; [10] 	subject 	::= 	iri | BlankNode | collection
165
;; Subject position: an iri, a blank node, or a collection, tried in that
;; order.
(define-peg-pattern subject all
  (or iri
      blank-node
      collection))
166
;; [11] 	predicate 	::= 	iri
167
;; A predicate is exactly an iri; the pattern exists so that the position
;; has its own named rule.
(define-peg-pattern predicate all
  iri)
168
;; [12] 	object 	::= 	iri | BlankNode | collection | blankNodePropertyList | literal
169
;; Object position.  Alternatives are tried in the order listed; note that
;; `blank-node' (which covers the empty "[ ]" form) comes before
;; `blank-node-property-list', and `literal' comes last.
(define-peg-pattern object all
  (or iri
      blank-node
      collection
      blank-node-property-list
      literal))
171
;; [13] 	literal 	::= 	RDFLiteral | NumericLiteral | BooleanLiteral
172
;; Any literal: an RDF (string) literal, a numeric literal, or a boolean
;; literal, tried in that order.
(define-peg-pattern literal body
  (or rdf-literal
      numeric-literal
      boolean-literal))
174
;; [14] 	blankNodePropertyList 	::= 	'[' predicateObjectList ']'
175
;; Bracketed property list: "[", a predicate-object list, "]", with filler
;; allowed inside the brackets.  The brackets are matched but ignored.
(define-peg-pattern blank-node-property-list all
  (and (ignore "[")
       WS
       predicate-object-list
       WS
       (ignore "]")))
177
;; [15] 	collection 	::= 	'(' object* ')'
178
;; Parenthesised collection: "(", any number of objects each followed by
;; filler, ")".  The parentheses are matched but ignored.
(define-peg-pattern collection all
  (and (ignore "(")
       WS
       (* (and object
               WS))
       (ignore ")")))
180
;; [16] 	NumericLiteral 	::= 	INTEGER | DECIMAL | DOUBLE
181
;; Numeric literal.  PEG choice is ordered and commits to the first
;; alternative that matches, so the most specific form must come first:
;; every double or decimal that starts with digits also starts with a valid
;; integer, and with `integer' tried first "1.5" or "1e3" would be parsed as
;; the integer "1" with the rest of the number left unconsumed.
;; `double' precedes `decimal' for the same reason ("1.5e3" starts with the
;; decimal "1.5").
(define-peg-pattern numeric-literal all
  (or double
      decimal
      integer))
182
;; [128s] 	RDFLiteral 	::= 	String (LANGTAG | '^^' iri)?
183
;; String literal with an optional annotation: either a language tag or
;; "^^" followed by a datatype iri.  Filler is accepted between the string
;; and the annotation, and between "^^" and the iri.
(define-peg-pattern rdf-literal all
  (and string-pat
       WS
       (? (or langtag
              (and "^^"
                   WS
                   iri)))))
185
;; [133s] 	BooleanLiteral 	::= 	'true' | 'false'
186
;; Boolean literal: the exact lower-case keyword "true" or "false".
(define-peg-pattern boolean-literal all
  (or "true"
      "false"))
187
;; [17] 	String 	::= 	STRING_LITERAL_QUOTE | STRING_LITERAL_SINGLE_QUOTE | STRING_LITERAL_LONG_SINGLE_QUOTE | STRING_LITERAL_LONG_QUOTE
188
;; Any of the four string forms.  The triple-quoted forms are tried before
;; the single-line forms; otherwise the first two quotes of a long-string
;; delimiter would be taken as an empty short string.
(define-peg-pattern string-pat all
  (or string-literal-long-single-quote
      string-literal-long-quote
      string-literal-quote
      string-literal-single-quote))
191
;; [135s] 	iri 	::= 	IRIREF | PrefixedName
192
;; An iri is a bracketed IRI reference or, failing that, a prefixed name.
(define-peg-pattern iri all
  (or iriref
      prefixed-name))
193
;; [136s] 	PrefixedName 	::= 	PNAME_LN | PNAME_NS
194
;; Prefixed name: the full "prefix:local" form is tried first, then the
;; bare "prefix:" form.  The order matters because `pname-ln' itself begins
;; with a `pname-ns'.
(define-peg-pattern prefixed-name all
  (or pname-ln
      pname-ns))
195
;; [137s] 	BlankNode 	::= 	BLANK_NODE_LABEL | ANON
196
;; Blank node: a labelled one ("_:name") or, failing that, an anonymous one
;; ("[ ]").
(define-peg-pattern blank-node all
  (or blank-node-label
      anon))
197
198
199
200
;; Match STR against the `turtle-doc' pattern and return the parse tree of
;; the resulting match record, as extracted by `peg:tree'.
(define (parse-turtle str)
  (let ((result (match-pattern turtle-doc str)))
    (peg:tree result)))
202