|
16 | 16 | ; SPDX-License-Identifier: Apache-2.0
|
17 | 17 | ;
|
18 | 18 |
|
19 |
| -(ns lice-comb.impl.matching |
20 |
| - "Matching helper functionality. Note: this namespace is not part of |
21 |
| - the public API of lice-comb and may change without notice." |
| 19 | +(ns lice-comb.impl.parsing |
| 20 | + "License name, URI, and text parsing functionality. Note: this namespace is |
| 21 | + not part of the public API of lice-comb and may change without notice." |
22 | 22 | (:require [clojure.string :as s]
|
23 | 23 | [clojure.set :as set]
|
24 | 24 | [clojure.java.io :as io]
|
|
28 | 28 | [spdx.expressions :as sexp]
|
29 | 29 | [embroidery.api :as e]
|
30 | 30 | [lice-comb.impl.spdx :as lcis]
|
31 |
| - [lice-comb.impl.regex-matching :as lcirm] |
| 31 | + [lice-comb.impl.id-detection :as lciid] |
| 32 | + [lice-comb.impl.splitting :as lcisp] |
32 | 33 | [lice-comb.impl.expressions-info :as lciei]
|
33 |
| - [lice-comb.impl.3rd-party :as lc3] |
34 | 34 | [lice-comb.impl.http :as lcihttp]
|
35 | 35 | [lice-comb.impl.data :as lcid]
|
36 | 36 | [lice-comb.impl.utils :as lciu]))
|
|
121 | 121 | fix-mpl-2
|
122 | 122 | fix-license-id-with-exception-id))
|
123 | 123 |
|
(defmulti match-text
  "Multimethod, dispatched on the class of its single argument, that returns an
  expressions-info map for the given license text, or nil if no matches are
  found."
  {:arglists '([text])}
  class)
|
129 | 129 |
|
130 |
| -(defmethod text->expressions-info java.lang.String |
| 130 | +(defmethod match-text java.lang.String |
131 | 131 | [s]
|
132 | 132 | ; clj-spdx's *-within-text APIs are *expensive* but support batching, so we check batches of ids in parallel
|
133 | 133 | (let [num-cpus (.availableProcessors (Runtime/getRuntime))
|
|
143 | 143 | ; Note: we don't need to sexp/normalise the keys here, as the only expressions that can be returned are constructed correctly
|
144 | 144 | (manual-fixes (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-matching-guidelines})) expressions-found))))))
|
145 | 145 |
|
(defmethod match-text java.io.Reader
  [r]
  ; Copy everything readable from `r` into an in-memory buffer, then re-dispatch
  ; on the resulting String (which selects the java.lang.String implementation).
  ; Note: this method does not close `r`.
  (let [buffer (java.io.StringWriter.)]
    (io/copy r buffer)
    (match-text (.toString buffer))))
151 | 151 |
|
(defmethod match-text java.io.InputStream
  [is]
  ; Wrap the stream in a reader (via clojure.java.io) and re-dispatch, which
  ; selects the java.io.Reader implementation of match-text.
  (-> is
      io/reader
      match-text))
155 | 155 |
|
(defmethod match-text :default
  [src]
  ; Fallback for any argument without a more specific implementation. A falsey
  ; `src` (e.g. nil) yields nil; anything else is coerced to a reader by
  ; clojure.java.io/reader.
  (if-not src
    nil
    (with-open [rdr (io/reader src)]
      ; Force the result while the reader is still open, since with-open closes
      ; it as soon as this form returns.
      (doall (match-text rdr)))))
161 | 161 |
|
(defn parse-uri
  "Parses the given license `uri`, returning an expressions-info map, or `nil`
  if no matching license ids were found.

  A blank (or `nil`) `uri` yields `nil` without any lookup being attempted."
  [uri]
  (when-not (s/blank? uri)
    (let [found (if-let [ids (lcis/near-match-uri uri)]
                  ; 1. The URI is a close match for one or more of the URIs in
                  ;    the SPDX license or exception lists, so build one entry
                  ;    per matched id
                  (into {}
                        (map (fn [id]
                               [id (list {:id id :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})]))
                        ids)
                  ; 2. Otherwise, attempt to retrieve the text/plain contents of
                  ;    the uri and perform license text matching on it
                  (when-let [license-text (lcihttp/get-text uri)]
                    (match-text license-text)))]
      ; The keys don't need to be sexp/normalised here, as an expression is
      ; never detected from a URI
      (->> found
           manual-fixes
           (lciei/prepend-source uri)))))
|
178 | 178 |
|
|
194 | 194 | (map #(apply hash-map %) cursed-name))
|
195 | 195 |
|
196 | 196 | ; 2. Is it an SPDX license or exception id?
|
197 |
| - (when-let [id (get @lcis/spdx-ids-d (s/lower-case s))] |
| 197 | + (when-let [id (lcis/near-match-id s)] |
198 | 198 | (if (= id s)
|
199 | 199 | (list {id (list {:id id :type :declared :strategy :spdx-listed-identifier-exact-match :source (list s)})})
|
200 | 200 | (list {id (list {:id id :type :concluded :confidence :high :strategy :spdx-listed-identifier-case-insensitive-match :source (list s)})})))
|
201 | 201 |
|
202 | 202 | ; 3. Is it the name of one or more SPDX licenses or exceptions?
|
203 |
| - (when-let [ids (get @lcis/index-name-to-id-d (s/lower-case s))] |
| 203 | + (when-let [ids (lcis/near-match-name s)] |
204 | 204 | (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-name :source (list s)})) ids))
|
205 | 205 |
|
206 | 206 | ; 4. Might it be a URI? (this is to handle some dumb corner cases that exist in pom.xml files hosted on Clojars & Maven Central)
|
207 |
| - (when-let [ids (uri->expressions-info s)] |
| 207 | + (when-let [ids (parse-uri s)] |
208 | 208 | (map #(hash-map (key %) (val %)) ids))
|
209 | 209 |
|
210 |
| - ; 5. Attempt regex name matching |
211 |
| - (lcirm/matches s) |
| 210 | + ; 5. Attempt to parse ids from the name |
| 211 | + (lciid/parse-ids s) |
212 | 212 |
|
213 |
| - ; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value instead of an SPDX license or exception identifier |
| 213 | + ; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value (NOT A LICENSEREF!) instead of an SPDX license or exception identifier |
214 | 214 | (let [id (str "UNIDENTIFIED-" s)]
|
215 | 215 | (list {id (list {:id id :type :concluded :confidence :low :confidence-explanations [:unidentified] :strategy :unidentified :source (list s)})})))]
|
216 | 216 | (map (partial lciei/prepend-source s) ids))))
|
217 | 217 |
|
218 |
| -(defn- filter-blanks |
219 |
| - "Filter blank strings out of coll" |
220 |
| - [coll] |
221 |
| - (when (seq coll) |
222 |
| - (seq (filter #(or (not (string? %)) (not (s/blank? %))) coll)))) |
223 |
| - |
224 |
| -(defn- map-split-and-interpose |
225 |
| - "Maps over the given sequence, splitting strings using the given regex re and |
226 |
| - interposing the given value inter, returning a (flattened) sequence." |
227 |
| - [re inter coll] |
228 |
| - (mapcat #(if-not (string? %) |
229 |
| - [%] |
230 |
| - (let [splits (s/split % re)] |
231 |
| - (if (nil? inter) |
232 |
| - splits |
233 |
| - (interpose inter splits)))) |
234 |
| - coll)) |
235 |
| - |
236 |
| -(defn split-on-operators |
237 |
| - "Case insensitively splits a string based on license operators (and, |
238 |
| - or, with), but only if they're not also part of a license name (e.g. |
239 |
| - 'Common Development and Distribution License', 'GNU General Public |
240 |
| - License version 2.0 or (at your option) any later version', etc.)." |
241 |
| - [s] |
242 |
| - (when-not (s/blank? s) |
243 |
| - (->> (s/split (s/trim s) #"(?i)\band[/-\\]+or\b") |
244 |
| - (map-split-and-interpose #"(?i)(\band\b|\&)(?!\s+(distribution|all\s+rights\s+reserved))" |
245 |
| - :and) |
246 |
| - (map-split-and-interpose #"(?i)\bor\b(?!\s*(-?(greater|(any\s+)?later|(any\s+)?lator|(any\s+)?newer|lesser|library|\(?at\s+your\s+(option|discretion)\)?|([\"']?(Revised|Modified)[\"']?))))" |
247 |
| - :or) |
248 |
| - (map-split-and-interpose #"(?i)\b(with\b|w/)(?!\s+the\s+acknowledgment\s+clause\s+removed)" |
249 |
| - :with) |
250 |
| - (map-split-and-interpose #"(?i)(?<=CDDL)/(?=GPL)" ; Special case for splitting particularly cursed combos such as CDDL/GPLv2+CE |
251 |
| - nil) |
252 |
| - filter-blanks |
253 |
| - (map #(if (string? %) (s/trim %) %))))) |
254 |
| - |
255 | 218 | (defn- fix-unidentified
|
256 | 219 | "Fixes a singleton UNIDENTIFIED- expression info map by converting the id to
|
257 | 220 | either a lice-comb unidentified LicenseRef or AdditionRef, depending on prev.
|
|
341 | 304 | (recur (process-expression-element result f) (first r) (rest r))
|
342 | 305 | (manual-fixes (into {} result)))))
|
343 | 306 |
|
344 |
| -(defn name->expressions-info |
345 |
| - "Returns an expressions-info map for the given license name." |
| 307 | +(defn parse-name |
| 308 | + "Parses the given license `n`ame, returning an expressions-info map." |
346 | 309 | [n]
|
347 | 310 | (when-not (s/blank? n)
|
348 | 311 | (let [n (s/trim n)
|
349 | 312 | partial-result (some->> n
|
350 |
| - split-on-operators ; Split on operators |
351 |
| - (drop-while keyword?) ; Drop (nonsensical) leading operators |
352 |
| - (lc3/rdrop-while keyword?) ; Drop (nonsensical) trailing operators |
353 |
| - dedupe ; Deduplicate consecutive identical values (mostly applies to duplicate operators, which are redundant) |
| 313 | + lcisp/split-on-operators ; Split on operators |
354 | 314 | (map #(if (keyword? %) % (string->ids-info %))) ; Determine SPDX ids (or UNIDENTIFIED-xxx) with info for all non-operators
|
355 | 315 | flatten ; Flatten back to an unnested sequence (since string->ids-info returns sequences)
|
356 | 316 | fix-unidentifieds ; Convert each unidentified non-operator into either a LicenseRef or AdditionRef, depending on context
|
|
374 | 334 | Note: this method has a substantial performance cost."
|
375 | 335 | []
|
376 | 336 | (lcis/init!)
|
377 |
| - (lcirm/init!) |
| 337 | + (lciid/init!) |
378 | 338 | (lcihttp/init!)
|
379 | 339 | @cursed-names-d
|
380 | 340 | nil)
|
0 commit comments