download.scm
1 | ;;;; Copyright (C) 2019, 2020 Julien Lepiller <julien@lepiller.eu> |
2 | ;;;; |
3 | ;;;; This library is free software; you can redistribute it and/or |
4 | ;;;; modify it under the terms of the GNU Lesser General Public |
5 | ;;;; License as published by the Free Software Foundation; either |
6 | ;;;; version 3 of the License, or (at your option) any later version. |
7 | ;;;; |
8 | ;;;; This library is distributed in the hope that it will be useful, |
9 | ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | ;;;; Lesser General Public License for more details. |
12 | ;;;; |
13 | ;;;; You should have received a copy of the GNU Lesser General Public |
14 | ;;;; License along with this library; if not, write to the Free Software |
15 | ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
16 | ;;;; |
17 | |
18 | (define-module (jsonld download) |
19 | #:use-module (ice-9 match) |
20 | #:use-module (json) |
21 | #:use-module (jsonld iri) |
22 | #:use-module (jsonld json) |
23 | #:use-module (web client) |
24 | #:use-module (web response) |
25 | #:use-module (web uri) |
26 | #:use-module (rnrs bytevectors) |
27 | #:use-module (ice-9 match) |
28 | #:use-module (srfi srfi-1) |
29 | #:use-module (srfi srfi-9) |
30 | #:use-module (sxml simple) |
31 | #:export (json-document |
32 | make-json-document |
33 | new-json-document |
34 | json-document? |
35 | json-document-context-url |
36 | json-document-document-url |
37 | json-document-document |
38 | json-document-content-type |
39 | json-document-profile |
40 | |
41 | download-json)) |
42 | |
43 | (define-record-type json-document |
44 | (make-json-document context-url document-url document content-type profile) |
45 | json-document? |
46 | (context-url json-document-context-url) |
47 | (document-url json-document-document-url) |
48 | (document json-document-document) |
49 | (content-type json-document-content-type) |
50 | (profile json-document-profile)) |
51 | |
52 | (define* (new-json-document #:key context-url document-url document content-type |
53 | profile) |
54 | (make-json-document context-url document-url document content-type profile)) |
55 | |
56 | (define-record-type link-header |
57 | (make-link-header uri params) |
58 | link-header? |
59 | (uri link-header-uri) |
60 | (params link-header-params)) |
61 | |
62 | (define-record-type content-type-header |
63 | (make-content-type-header type params) |
64 | content-type-header? |
65 | (type content-type-header-type) |
66 | (params content-type-header-params)) |
67 | |
68 | (define (parse-link-header link) |
69 | (let* ((header (string-split link #\;)) |
70 | (uri (car header)) |
71 | (uri (substring uri 1 (- (string-length uri) 1))) |
72 | (params (cdr header)) |
73 | (params (map |
74 | (lambda (param) |
75 | (match (map |
76 | (lambda (l) (string-trim l #\space)) |
77 | (string-split param #\=)) |
78 | ((type val) |
79 | (cons type (substring val 1 (- (string-length val) 1)))))) |
80 | params))) |
81 | (make-link-header uri params))) |
82 | |
83 | (define (parse-type type) |
84 | (let* ((mime-type (string-split type #\;)) |
85 | (type (car mime-type)) |
86 | (params (map |
87 | (lambda (param) |
88 | (match (string-split param #\=) |
89 | ((type val ...) |
90 | (cons (string-trim type #\space) |
91 | (string-trim (string-join val "=") #\space))))) |
92 | (cdr mime-type)))) |
93 | (make-content-type-header type params))) |
94 | |
95 | (define (html-document-base base document) |
96 | (define (find-base document) |
97 | (match document |
98 | (('base ('@ attr ...)) |
99 | (let loop ((attr attr)) |
100 | (if (null? attr) |
101 | #f |
102 | (match (car attr) |
103 | (('href base) base) |
104 | (_ (loop (cdr attr))))))) |
105 | ('() #f) |
106 | ((tag ('@ opt ...) rest ...) |
107 | (fold (lambda (element result) |
108 | (or result (find-base element))) |
109 | #f rest)) |
110 | ((tag rest ...) |
111 | (fold (lambda (element result) |
112 | (or result (find-base element))) |
113 | #f rest)) |
114 | (_ #f))) |
115 | |
116 | (let ((new-base (find-base document))) |
117 | (if new-base |
118 | (resolve-iri base new-base) |
119 | base))) |
120 | |
121 | (define (get-script-elements document) |
122 | (define (find-scripts document) |
123 | (match document |
124 | (('script t ...) |
125 | (list document)) |
126 | ('() '()) |
127 | ((tag ('@ opt ...) rest ...) |
128 | (append-map get-script-elements rest)) |
129 | ((tag rest ...) |
130 | (append-map get-script-elements rest)) |
131 | (_ '()))) |
132 | |
133 | (define (jsonld-script? element) |
134 | (match element |
135 | (('script ('@ opt ...) content) |
136 | (let* ((types (filter (lambda (opt) (equal? (car opt) 'type)) opt)) |
137 | (type (if (null? types) #f (cadr (car types))))) |
138 | (and type |
139 | (equal? (content-type-header-type (parse-type type)) |
140 | "application/ld+json")))) |
141 | (_ #f))) |
142 | |
143 | (filter jsonld-script? (find-scripts document))) |
144 | |
145 | (define (get-script-content-by-id document id) |
146 | (define (has-correct-id? element) |
147 | (match element |
148 | ((tag ('@ opts ...) content) |
149 | (let* ((ids (filter (lambda (o) (equal? (car o) 'id)) opts)) |
150 | (element-id (if (null? ids) #f (cadr (car ids))))) |
151 | (equal? element-id id))))) |
152 | |
153 | (let ((id-scripts (filter has-correct-id? (get-script-elements document)))) |
154 | (when (null? id-scripts) |
155 | (throw 'loading-document-failed)) |
156 | (get-script-content (car id-scripts)))) |
157 | |
158 | (define (get-script-content-by-profile document profile) |
159 | (define (has-correct-profile? element) |
160 | (match element |
161 | ((tag ('@ opts ...) content) |
162 | (let* ((types (filter (lambda (o) (equal? (car o) 'type)) opts)) |
163 | (type (if (null? types) #f (cadr (car types))))) |
164 | (and type |
165 | (equal? |
166 | (assoc-ref |
167 | (content-type-header-params (parse-type type)) |
168 | "profile") |
169 | profile)))))) |
170 | |
171 | (let ((profile-scripts (filter has-correct-profile? |
172 | (get-script-elements document)))) |
173 | (when (null? profile-scripts) |
174 | (throw 'loading-document-failed)) |
175 | (get-script-content (car profile-scripts)))) |
176 | |
177 | (define (get-first-script-content document) |
178 | (let ((scripts (get-script-elements document))) |
179 | (when (null? scripts) |
180 | (throw 'loading-document-failed)) |
181 | (get-script-content (car scripts)))) |
182 | |
183 | (define (get-script-content element) |
184 | (match element |
185 | (('script ('@ opts ...) content) |
186 | content))) |
187 | |
188 | (define (reverse-document document) |
189 | "Reverse the order of elements in @var{document}, a Json object, array or |
190 | scalar. This recursively reverses the order of any array or key-pair |
191 | association." |
192 | (match document |
193 | ((? json-array? document) |
194 | (list->array 1 (map reverse-document (array->list document)))) |
195 | ((? json-object? document) |
196 | (fold (lambda (pair result) (cons (cons (car pair) (reverse-document (cdr pair))) result)) '() document)) |
197 | (_ document))) |
198 | |
199 | (define (links-with-rel links rel) |
200 | (filter |
201 | (lambda (link) |
202 | (equal? |
203 | (assoc-ref (link-header-params link) "rel") |
204 | rel)) |
205 | links)) |
206 | |
207 | (define (html->sxml document) |
208 | "Convert document (a string) into an sxml representation that preserves entities" |
209 | (xml->sxml document |
210 | #:entities |
211 | '((lt . "&lt;") |
212 | (gt . "&gt;") |
213 | ;(amp . "&amp;") |
214 | (apos . "&apos;")) |
215 | #:default-entity-handler |
216 | (lambda (port name) |
217 | (string-append "&" (symbol->string name) ";")))) |
218 | |
219 | (define* (download-json uri #:key (mime-type "application/ld+json") |
220 | (extract-all-scripts? #f) |
221 | (profile #f) (request-profile #f) |
222 | ;; Used for tests |
223 | (http-get http-get)) |
224 | "Download a JsonLD document from @var{uri}, using the HTTP protocol. The |
225 | Accept HTTP header can be modified with @var{mime-type}." |
226 | (define complete-mime-type |
227 | (string-append |
228 | (if request-profile |
229 | (string-append mime-type ";profile=" request-profile) |
230 | mime-type) |
231 | (if (equal? request-profile "http://www.w3.org/ns/json-ld#context") |
232 | "" |
233 | ", text/html;q=0.8; application/xhtml+xml;q=0.5"))) |
234 | |
235 | (catch #t |
236 | (lambda () |
237 | (call-with-values |
238 | (lambda () |
239 | (http-get uri #:headers `((Accept . ,complete-mime-type)))) |
240 | (lambda (hdr body) |
241 | (cond |
242 | ((member (response-code hdr) '(301 302 303 307)) |
243 | (download-json (uri->string (response-location hdr)) |
244 | #:mime-type mime-type |
245 | #:extract-all-scripts? extract-all-scripts? |
246 | #:profile profile |
247 | #:request-profile request-profile)) |
248 | ((equal? (response-code hdr) 200) |
249 | (let* ((content-type (response-content-type hdr)) |
250 | (type (symbol->string (car content-type))) |
251 | (link-headers |
252 | (map cdr (filter |
253 | (lambda (p) (equal? (car p) 'link)) |
254 | (response-headers hdr)))) |
255 | (links (map parse-link-header link-headers)) |
256 | (alternates (links-with-rel links "alternate")) |
257 | (alternates (filter |
258 | (lambda (alternate) |
259 | (equal? |
260 | (assoc-ref (link-header-params alternate) "type") |
261 | "application/ld+json")) |
262 | alternates)) |
263 | (contexts (links-with-rel links "http://www.w3.org/ns/json-ld#context")) |
264 | (context-url #f) |
265 | (document #f) |
266 | (base (if (string? uri) uri (uri->string uri)))) |
267 | (if (and (not (equal? type "application/json")) |
268 | (or |
269 | (< (string-length type) 5) |
270 | (not (equal? |
271 | (substring type |
272 | (- (string-length type) 5)) |
273 | "+json"))) |
274 | (not (null? alternates))) |
275 | (download-json (resolve-iri base (link-header-uri (car alternates))) |
276 | #:mime-type mime-type |
277 | #:extract-all-scripts? extract-all-scripts? |
278 | #:profile profile |
279 | #:request-profile request-profile) |
280 | (begin |
281 | (when (and (or (equal? type "application/json") |
282 | (and (> (string-length type) 5) |
283 | (equal? (substring |
284 | type (- (string-length type) 5)) |
285 | "+json"))) |
286 | (not (equal? type "application/ld+json")) |
287 | (not (null? contexts))) |
288 | (set! context-url |
289 | (resolve-iri base (link-header-uri (car contexts)))) |
290 | (when (> (length contexts) 1) |
291 | (throw 'multiple-context-link-headers))) |
292 | (cond |
293 | ((or (equal? type "application/json") |
294 | (and (> (string-length type) 5) |
295 | (equal? (substring type |
296 | (- (string-length type) 5)) |
297 | "+json"))) |
298 | (set! document (reverse-document |
299 | (json-string->scm |
300 | (if (string? body) |
301 | body |
302 | (utf8->string body)))))) |
303 | ((or (equal? type "text/html") |
304 | (equal? type "application/xhtml+xml")) |
305 | (let* ((content (html->sxml |
306 | (if (string? body) |
307 | body |
308 | (utf8->string body)))) |
309 | (uri (string->uri base)) |
310 | (source #f)) |
311 | (set! base (html-document-base base content)) |
312 | (when (uri-fragment uri) |
313 | (set! source (get-script-content-by-id |
314 | content (uri-fragment uri)))) |
315 | (when (and (equal? source #f) profile) |
316 | (set! source (get-script-content-by-profile |
317 | content profile))) |
318 | (when (and (equal? source #f) (not extract-all-scripts?)) |
319 | (set! source (get-first-script-content content))) |
320 | (if source |
321 | (catch #t |
322 | (lambda () |
323 | (set! document (reverse-document |
324 | (json-string->scm source)))) |
325 | (lambda _ |
326 | (throw 'invalid-script-element source))) |
327 | (begin |
328 | (unless extract-all-scripts? |
329 | (throw 'loading-document-failed)) |
330 | (set! document '()) |
331 | (for-each |
332 | (lambda (el) |
333 | (catch #t |
334 | (lambda () |
335 | (let ((script-content |
336 | (reverse-document |
337 | (json-string->scm |
338 | (get-script-content el))))) |
339 | (set! document |
340 | (append document |
341 | (if (json-array? script-content) |
342 | (array->list script-content) |
343 | (list script-content)))))) |
344 | (lambda _ |
345 | (throw 'invalid-script-element el)))) |
346 | (get-script-elements content)) |
347 | (set! document (list->array 1 document)))))) |
348 | (else |
349 | (throw 'loading-remote-document-failed))) |
350 | (new-json-document |
351 | #:document-url base |
352 | #:document document |
353 | #:content-type type |
354 | #:context-url context-url))))) |
355 | (else (throw 'not-found)))))) |
356 | (lambda (key . value) |
357 | (cond |
358 | ((equal? key 'multiple-context-link-headers) |
359 | (apply throw 'multiple-context-link-headers value)) |
360 | ((equal? key 'invalid-script-element) |
361 | (apply throw 'invalid-script-element value)) |
362 | (else (apply throw 'loading-remote-document-failed key value)))))) |
363 |