download.scm
1 | ;;;; Copyright (C) 2019, 2020 Julien Lepiller <julien@lepiller.eu> |
2 | ;;;; |
3 | ;;;; This library is free software; you can redistribute it and/or |
4 | ;;;; modify it under the terms of the GNU Lesser General Public |
5 | ;;;; License as published by the Free Software Foundation; either |
6 | ;;;; version 3 of the License, or (at your option) any later version. |
7 | ;;;; |
8 | ;;;; This library is distributed in the hope that it will be useful, |
9 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | ;;;; Lesser General Public License for more details. |
12 | ;;;; |
13 | ;;;; You should have received a copy of the GNU Lesser General Public |
14 | ;;;; License along with this library; if not, write to the Free Software |
15 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
16 | ;;;; |
17 | |
18 | (define-module (jsonld download) |
19 | #:use-module (ice-9 match) |
20 | #:use-module (json) |
21 | #:use-module (jsonld iri) |
22 | #:use-module (jsonld json) |
23 | #:use-module (web client) |
24 | #:use-module (web response) |
25 | #:use-module (web uri) |
26 | #:use-module (rnrs bytevectors) |
27 | #:use-module (srfi srfi-1) |
28 | #:use-module (srfi srfi-9) |
29 | #:use-module (sxml simple) |
30 | #:export (json-document |
31 | make-json-document |
32 | new-json-document |
33 | json-document? |
34 | json-document-context-url |
35 | json-document-document-url |
36 | json-document-document |
37 | json-document-content-type |
38 | json-document-profile |
39 | |
40 | download-json)) |
41 | |
42 | (define-record-type json-document |
43 | (make-json-document context-url document-url document content-type profile) |
44 | json-document? |
45 | (context-url json-document-context-url) |
46 | (document-url json-document-document-url) |
47 | (document json-document-document) |
48 | (content-type json-document-content-type) |
49 | (profile json-document-profile)) |
50 | |
51 | (define* (new-json-document #:key context-url document-url document content-type |
52 | profile) |
53 | (make-json-document context-url document-url document content-type profile)) |
54 | |
55 | (define-record-type link-header |
56 | (make-link-header uri params) |
57 | link-header? |
58 | (uri link-header-uri) |
59 | (params link-header-params)) |
60 | |
61 | (define-record-type content-type-header |
62 | (make-content-type-header type params) |
63 | content-type-header? |
64 | (type content-type-header-type) |
65 | (params content-type-header-params)) |
66 | |
67 | (define (parse-link-header link) |
68 | (let* ((header (string-split link #\;)) |
69 | (uri (car header)) |
70 | (uri (substring uri 1 (- (string-length uri) 1))) |
71 | (params (cdr header)) |
72 | (params (map |
73 | (lambda (param) |
74 | (match (map |
75 | (lambda (l) (string-trim l #\space)) |
76 | (string-split param #\=)) |
77 | ((type val) |
78 | (cons type (substring val 1 (- (string-length val) 1)))))) |
79 | params))) |
80 | (make-link-header uri params))) |
81 | |
82 | (define (parse-type type) |
83 | (let* ((mime-type (string-split type #\;)) |
84 | (type (car mime-type)) |
85 | (params (map |
86 | (lambda (param) |
87 | (match (string-split param #\=) |
88 | ((type val ...) |
89 | (cons (string-trim type #\space) |
90 | (string-trim (string-join val "=") #\space))))) |
91 | (cdr mime-type)))) |
92 | (make-content-type-header type params))) |
93 | |
94 | (define (html-document-base base document) |
95 | (define (find-base document) |
96 | (match document |
97 | (('base ('@ attr ...)) |
98 | (let loop ((attr attr)) |
99 | (if (null? attr) |
100 | #f |
101 | (match (car attr) |
102 | (('href base) base) |
103 | (_ (loop (cdr attr))))))) |
104 | ('() #f) |
105 | ((tag ('@ opt ...) rest ...) |
106 | (fold (lambda (element result) |
107 | (or result (find-base element))) |
108 | #f rest)) |
109 | ((tag rest ...) |
110 | (fold (lambda (element result) |
111 | (or result (find-base element))) |
112 | #f rest)) |
113 | (_ #f))) |
114 | |
115 | (let ((new-base (find-base document))) |
116 | (if new-base |
117 | (resolve-iri base new-base) |
118 | base))) |
119 | |
120 | (define (get-script-elements document) |
121 | (define (find-scripts document) |
122 | (match document |
123 | (('script t ...) |
124 | (list document)) |
125 | ('() '()) |
126 | ((tag ('@ opt ...) rest ...) |
127 | (append-map get-script-elements rest)) |
128 | ((tag rest ...) |
129 | (append-map get-script-elements rest)) |
130 | (_ '()))) |
131 | |
132 | (define (jsonld-script? element) |
133 | (match element |
134 | (('script ('@ opt ...) content) |
135 | (let* ((types (filter (lambda (opt) (equal? (car opt) 'type)) opt)) |
136 | (type (if (null? types) #f (cadr (car types))))) |
137 | (and type |
138 | (equal? (content-type-header-type (parse-type type)) |
139 | "application/ld+json")))) |
140 | (_ #f))) |
141 | |
142 | (filter jsonld-script? (find-scripts document))) |
143 | |
144 | (define (get-script-content-by-id document id) |
145 | (define (has-correct-id? element) |
146 | (match element |
147 | ((tag ('@ opts ...) content) |
148 | (let* ((ids (filter (lambda (o) (equal? (car o) 'id)) opts)) |
149 | (element-id (if (null? ids) #f (cadr (car ids))))) |
150 | (equal? element-id id))))) |
151 | |
152 | (let ((id-scripts (filter has-correct-id? (get-script-elements document)))) |
153 | (when (null? id-scripts) |
154 | (throw 'loading-document-failed)) |
155 | (get-script-content (car id-scripts)))) |
156 | |
157 | (define (get-script-content-by-profile document profile) |
158 | (define (has-correct-profile? element) |
159 | (match element |
160 | ((tag ('@ opts ...) content) |
161 | (let* ((types (filter (lambda (o) (equal? (car o) 'type)) opts)) |
162 | (type (if (null? types) #f (cadr (car types))))) |
163 | (and type |
164 | (equal? |
165 | (assoc-ref |
166 | (content-type-header-params (parse-type type)) |
167 | "profile") |
168 | profile)))))) |
169 | |
170 | (let ((profile-scripts (filter has-correct-profile? |
171 | (get-script-elements document)))) |
172 | (when (null? profile-scripts) |
173 | (throw 'loading-document-failed)) |
174 | (get-script-content (car profile-scripts)))) |
175 | |
176 | (define (get-first-script-content document) |
177 | (let ((scripts (get-script-elements document))) |
178 | (when (null? scripts) |
179 | (throw 'loading-document-failed)) |
180 | (get-script-content (car scripts)))) |
181 | |
182 | (define (get-script-content element) |
183 | (match element |
184 | (('script ('@ opts ...) content) |
185 | content))) |
186 | |
187 | (define (reverse-document document) |
188 | "Reverse the order of elements in @var{document}, a Json object, array or |
189 | scalar. This recursively reverses the order of any array or key-pair |
190 | association." |
191 | (match document |
192 | ((? json-array? document) |
193 | (list->array 1 (map reverse-document (array->list document)))) |
194 | ((? json-object? document) |
195 | (fold (lambda (pair result) (cons (cons (car pair) (reverse-document (cdr pair))) result)) '() document)) |
196 | (_ document))) |
197 | |
198 | (define (links-with-rel links rel) |
199 | (filter |
200 | (lambda (link) |
201 | (equal? |
202 | (assoc-ref (link-header-params link) "rel") |
203 | rel)) |
204 | links)) |
205 | |
206 | (define (html->sxml document) |
207 | "Convert document (a string) into an sxml representation that preserves entities" |
208 | (xml->sxml document |
209 | #:entities |
210 | '((lt . "&lt;") |
211 | (gt . "&gt;") |
212 | ;(amp . "&amp;") |
213 | (apos . "&apos;")) |
214 | #:default-entity-handler |
215 | (lambda (port name) |
216 | (string-append "&" (symbol->string name) ";")))) |
217 | |
218 | (define* (download-json uri #:key (mime-type "application/ld+json") |
219 | (extract-all-scripts? #f) |
220 | (profile #f) (request-profile #f) |
221 | ;; Used for tests |
222 | (http-get http-get)) |
223 | "Download a JsonLD document from @var{uri}, using the HTTP protocol. The |
224 | Accept HTTP header can be modified with @var{mime-type}." |
225 | (define complete-mime-type |
226 | (string-append |
227 | (if request-profile |
228 | (string-append mime-type ";profile=" request-profile) |
229 | mime-type) |
230 | (if (equal? request-profile "http://www.w3.org/ns/json-ld#context") |
231 | "" |
232 | ", text/html;q=0.8; application/xhtml+xml;q=0.5"))) |
233 | |
234 | (catch #t |
235 | (lambda () |
236 | (call-with-values |
237 | (lambda () |
238 | (http-get uri #:headers `((Accept . ,complete-mime-type)))) |
239 | (lambda (hdr body) |
240 | (cond |
241 | ((member (response-code hdr) '(301 302 303 307)) |
242 | (download-json (uri->string (response-location hdr)) |
243 | #:mime-type mime-type |
244 | #:extract-all-scripts? extract-all-scripts? |
245 | #:profile profile |
246 | #:request-profile request-profile)) |
247 | ((equal? (response-code hdr) 200) |
248 | (let* ((content-type (response-content-type hdr)) |
249 | (type (symbol->string (car content-type))) |
250 | (link-headers |
251 | (map cdr (filter |
252 | (lambda (p) (equal? (car p) 'link)) |
253 | (response-headers hdr)))) |
254 | (links (map parse-link-header link-headers)) |
255 | (alternates (links-with-rel links "alternate")) |
256 | (alternates (filter |
257 | (lambda (alternate) |
258 | (equal? |
259 | (assoc-ref (link-header-params alternate) "type") |
260 | "application/ld+json")) |
261 | alternates)) |
262 | (contexts (links-with-rel links "http://www.w3.org/ns/json-ld#context")) |
263 | (context-url #f) |
264 | (document #f) |
265 | (base (if (string? uri) uri (uri->string uri)))) |
266 | (if (and (not (equal? type "application/json")) |
267 | (or |
268 | (< (string-length type) 5) |
269 | (not (equal? |
270 | (substring type |
271 | (- (string-length type) 5)) |
272 | "+json"))) |
273 | (not (null? alternates))) |
274 | (download-json (resolve-iri base (link-header-uri (car alternates))) |
275 | #:mime-type mime-type |
276 | #:extract-all-scripts? extract-all-scripts? |
277 | #:profile profile |
278 | #:request-profile request-profile) |
279 | (begin |
280 | (when (and (or (equal? type "application/json") |
281 | (and (> (string-length type) 5) |
282 | (equal? (substring |
283 | type (- (string-length type) 5)) |
284 | "+json"))) |
285 | (not (equal? type "application/ld+json")) |
286 | (not (null? contexts))) |
287 | (set! context-url |
288 | (resolve-iri base (link-header-uri (car contexts)))) |
289 | (when (> (length contexts) 1) |
290 | (throw 'multiple-context-link-headers))) |
291 | (cond |
292 | ((or (equal? type "application/json") |
293 | (and (> (string-length type) 5) |
294 | (equal? (substring type |
295 | (- (string-length type) 5)) |
296 | "+json"))) |
297 | (set! document (reverse-document |
298 | (json-string->scm |
299 | (if (string? body) |
300 | body |
301 | (utf8->string body)))))) |
302 | ((or (equal? type "text/html") |
303 | (equal? type "application/xhtml+xml")) |
304 | (let* ((content (html->sxml |
305 | (if (string? body) |
306 | body |
307 | (utf8->string body)))) |
308 | (uri (string->uri base)) |
309 | (source #f)) |
310 | (set! base (html-document-base base content)) |
311 | (when (uri-fragment uri) |
312 | (set! source (get-script-content-by-id |
313 | content (uri-fragment uri)))) |
314 | (when (and (equal? source #f) profile) |
315 | (set! source (get-script-content-by-profile |
316 | content profile))) |
317 | (when (and (equal? source #f) (not extract-all-scripts?)) |
318 | (set! source (get-first-script-content content))) |
319 | (if source |
320 | (catch #t |
321 | (lambda () |
322 | (set! document (reverse-document |
323 | (json-string->scm source)))) |
324 | (lambda _ |
325 | (throw 'invalid-script-element source))) |
326 | (begin |
327 | (unless extract-all-scripts? |
328 | (throw 'loading-document-failed)) |
329 | (set! document '()) |
330 | (for-each |
331 | (lambda (el) |
332 | (catch #t |
333 | (lambda () |
334 | (let ((script-content |
335 | (reverse-document |
336 | (json-string->scm |
337 | (get-script-content el))))) |
338 | (set! document |
339 | (append document |
340 | (if (json-array? script-content) |
341 | (array->list script-content) |
342 | (list script-content)))))) |
343 | (lambda _ |
344 | (throw 'invalid-script-element el)))) |
345 | (get-script-elements content)) |
346 | (set! document (list->array 1 document)))))) |
347 | (else |
348 | (throw 'loading-remote-document-failed))) |
349 | (new-json-document |
350 | #:document-url base |
351 | #:document document |
352 | #:content-type type |
353 | #:context-url context-url))))) |
354 | (else (throw 'not-found)))))) |
355 | (lambda (key . value) |
356 | (cond |
357 | ((equal? key 'multiple-context-link-headers) |
358 | (apply throw 'multiple-context-link-headers value)) |
359 | ((equal? key 'invalid-script-element) |
360 | (apply throw 'invalid-script-element value)) |
361 | (else (apply throw 'loading-remote-document-failed key value)))))) |
362 |