Skip to content

Commit b85c65d

Browse files
committed
🚧 Improved name parsing logic (WIP)
1 parent f30c98d commit b85c65d

15 files changed

+581
-243
lines changed

src/lice_comb/impl/3rd_party.clj

-37
This file was deleted.

src/lice_comb/impl/data.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
Use underscore ('_') instead.
3535
* Unlike during class loading, Clojure does not automatically switch hyphens
3636
in classpath resource path elements to underscores. This inconsistency can
37-
be a time-wasting trap."
37+
be a time-wasting foot gun."
3838
[path]
3939
(when-not (s/blank? path)
4040
(try

src/lice_comb/impl/regex_matching.clj → src/lice_comb/impl/id_detection.clj

+11-10
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
; SPDX-License-Identifier: Apache-2.0
1717
;
1818

19-
(ns lice-comb.impl.regex-matching
20-
"Helper functionality focused on regex matching. Note: this namespace is not
21-
part of the public API of lice-comb and may change without notice."
19+
(ns lice-comb.impl.id-detection
20+
"Helper functionality focused on detecting SPDX id(s) from a (short) string.
21+
Note: this namespace is not part of the public API of lice-comb and may change
22+
without notice."
2223
(:require [clojure.string :as s]
2324
[clojure.set :as set]
2425
[medley.core :as med]
@@ -194,11 +195,11 @@
194195
version (get-rencgs m ["version"] (if (= variant "LGPL") "2.0" "1.0"))
195196
version (s/replace version #"\p{Punct}+" ".")
196197
[confidence confidence-explanations]
197-
(if (s/blank? version)
198-
[:low #{:missing-version}]
198+
(if version-present?
199199
(if (s/includes? version ".")
200200
[:high]
201-
[:medium #{:partial-version}]))
201+
[:medium #{:partial-version}])
202+
[:low #{:missing-version}])
202203
version (if (s/includes? version ".")
203204
version
204205
(str version ".0"))
@@ -223,7 +224,7 @@
223224

224225
; The regex for the GNU family is a nightmare, so we build it up (and test it) in pieces
225226
(def agpl-re #"(?<agpl>AGPL|Affero)(\s+GNU)?(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?AGPL\)?)?")
226-
(def lgpl-re #"(?<lgpl>(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Library|Lesser))|((Library\s+or\s+Lesser|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?")
227+
(def lgpl-re #"(?<lgpl>(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser))|((Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?")
227228
(def gpl-re #"(?<!(Affero|Lesser|Library)\s+)(?<gpl>GNU(?!\s+Classpath)|(?<!(L|A)\s*)GPL|Genere?al\s+Pub?lic\s+Licen[cs]e)(?!\s+(Affero|Library|Lesser|Genere?al\s+Lesser|Genere?al\s+Library|LGPL|AGPL))((\s+General)?(?!\s+(Affero|Lesser|Library))\s+Pub?lic\s+Licen[cs]e)?(\s+\(?GPL\)?)?")
228229
(def version-re #"[\s,-]*(_?V(ersion)?)?[\s\._]*(?<version>\d+([\._]\d+)?)?")
229230
(def only-or-later-re #"[\s,-]*((?<only>\(?only\)?)|(\(?or(\s+\(?at\s+your\s+(option|discretion)\)?)?(\s+any)?)?([\s-]*(?<orLater>lat[eo]r|newer|greater|\+)))?")
@@ -370,7 +371,7 @@
370371
:fn (constantly ["Zlib" :high])}
371372
])))
372373

373-
(defn- match
374+
(defn- parse-id
374375
"If a match occurred for the given regex element when tested against string s,
375376
returns a map containing the following keys:
376377
* :id The SPDX license or exception identifier that was determined
@@ -394,7 +395,7 @@
394395
:start (:start match)}
395396
(when (seq confidence-explanations) {:confidence-explanations confidence-explanations})))))
396397

397-
(defn matches
398+
(defn parse-ids
398399
"Returns a sequence (NOT A SET!) of maps where each key is a SPDX license or
399400
exception identifier (a String) that was found in s, and the value is a
400401
sequence containing a single map describing how the identifier was determined.
@@ -410,7 +411,7 @@
410411
Results are in the order in which they appear in the string, and the function
411412
returns nil if there were no matches."
412413
[s]
413-
(when-let [matches (seq (filter identity (e/pmap* (partial match s) @license-name-matching-d)))]
414+
(when-let [matches (seq (filter identity (e/pmap* (partial parse-id s) @license-name-matching-d)))]
414415
(some->> matches
415416
(med/distinct-by :id) ;####TODO: THINK ABOUT MERGING INSTEAD OF DROPPING
416417
(sort-by :start)

src/lice_comb/impl/matching.clj → src/lice_comb/impl/parsing.clj

+34-74
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,9 @@
1616
; SPDX-License-Identifier: Apache-2.0
1717
;
1818

19-
(ns lice-comb.impl.matching
20-
"Matching helper functionality. Note: this namespace is not part of
21-
the public API of lice-comb and may change without notice."
19+
(ns lice-comb.impl.parsing
20+
"License name, URI, and text parsing functionality. Note: this namespace is
21+
not part of the public API of lice-comb and may change without notice."
2222
(:require [clojure.string :as s]
2323
[clojure.set :as set]
2424
[clojure.java.io :as io]
@@ -28,9 +28,9 @@
2828
[spdx.expressions :as sexp]
2929
[embroidery.api :as e]
3030
[lice-comb.impl.spdx :as lcis]
31-
[lice-comb.impl.regex-matching :as lcirm]
31+
[lice-comb.impl.id-detection :as lciid]
32+
[lice-comb.impl.splitting :as lcisp]
3233
[lice-comb.impl.expressions-info :as lciei]
33-
[lice-comb.impl.3rd-party :as lc3]
3434
[lice-comb.impl.http :as lcihttp]
3535
[lice-comb.impl.data :as lcid]
3636
[lice-comb.impl.utils :as lciu]))
@@ -121,13 +121,13 @@
121121
fix-mpl-2
122122
fix-license-id-with-exception-id))
123123

124-
(defmulti text->expressions-info
124+
(defmulti match-text
125125
"Returns an expressions-info map for the given license text, or nil if no
126126
matches are found."
127127
{:arglists '([text])}
128128
class)
129129

130-
(defmethod text->expressions-info java.lang.String
130+
(defmethod match-text java.lang.String
131131
[s]
132132
; clj-spdx's *-within-text APIs are *expensive* but support batching, so we check batches of ids in parallel
133133
(let [num-cpus (.availableProcessors (Runtime/getRuntime))
@@ -143,36 +143,36 @@
143143
; Note: we don't need to sexp/normalise the keys here, as the only expressions that can be returned are constructed correctly
144144
(manual-fixes (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-matching-guidelines})) expressions-found))))))
145145

146-
(defmethod text->expressions-info java.io.Reader
146+
(defmethod match-text java.io.Reader
147147
[r]
148148
(let [sw (java.io.StringWriter.)]
149149
(io/copy r sw)
150-
(text->expressions-info (str sw))))
150+
(match-text (str sw))))
151151

152-
(defmethod text->expressions-info java.io.InputStream
152+
(defmethod match-text java.io.InputStream
153153
[is]
154-
(text->expressions-info (io/reader is)))
154+
(match-text (io/reader is)))
155155

156-
(defmethod text->expressions-info :default
156+
(defmethod match-text :default
157157
[src]
158158
(when src
159159
(with-open [r (io/reader src)]
160-
(doall (text->expressions-info r)))))
160+
(doall (match-text r)))))
161161

162-
(defn uri->expressions-info
163-
"Returns an expressions-info map for the given license uri, or nil if no
164-
matches are found."
162+
(defn parse-uri
163+
"Parses the given license `uri`, returning an expressions-info map, or `nil`
164+
if no matching license ids were found."
165165
[uri]
166166
(when-not (s/blank? uri)
167167
(let [result (manual-fixes
168-
(let [suri (lciu/simplify-uri uri)]
169-
(or ; 1. Does the simplified URI match any of the simplified URIs in the SPDX license or exception lists?
170-
(when-let [ids (get @lcis/index-uri-to-id-d suri)]
171-
(into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids)))
172-
173-
; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it
174-
(when-let [license-text (lcihttp/get-text uri)]
175-
(text->expressions-info license-text)))))]
168+
(or
169+
; 1. Is the URI a close match for any of the URIs in the SPDX license or exception lists?
170+
(when-let [ids (lcis/near-match-uri uri)]
171+
(into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids)))
172+
173+
; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it
174+
(when-let [license-text (lcihttp/get-text uri)]
175+
(match-text license-text))))]
176176
; We don't need to sexp/normalise the keys here, as we never detect an expression from a URI
177177
(lciei/prepend-source uri result))))
178178

@@ -194,64 +194,27 @@
194194
(map #(apply hash-map %) cursed-name))
195195

196196
; 2. Is it an SPDX license or exception id?
197-
(when-let [id (get @lcis/spdx-ids-d (s/lower-case s))]
197+
(when-let [id (lcis/near-match-id s)]
198198
(if (= id s)
199199
(list {id (list {:id id :type :declared :strategy :spdx-listed-identifier-exact-match :source (list s)})})
200200
(list {id (list {:id id :type :concluded :confidence :high :strategy :spdx-listed-identifier-case-insensitive-match :source (list s)})})))
201201

202202
; 3. Is it the name of one or more SPDX licenses or exceptions?
203-
(when-let [ids (get @lcis/index-name-to-id-d (s/lower-case s))]
203+
(when-let [ids (lcis/near-match-name s)]
204204
(map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-name :source (list s)})) ids))
205205

206206
; 4. Might it be a URI? (this is to handle some dumb corner cases that exist in pom.xml files hosted on Clojars & Maven Central)
207-
(when-let [ids (uri->expressions-info s)]
207+
(when-let [ids (parse-uri s)]
208208
(map #(hash-map (key %) (val %)) ids))
209209

210-
; 5. Attempt regex name matching
211-
(lcirm/matches s)
210+
; 5. Attempt to parse ids from the name
211+
(lciid/parse-ids s)
212212

213-
; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value instead of an SPDX license or exception identifier
213+
; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value (NOT A LICENSEREF!) instead of an SPDX license or exception identifier
214214
(let [id (str "UNIDENTIFIED-" s)]
215215
(list {id (list {:id id :type :concluded :confidence :low :confidence-explanations [:unidentified] :strategy :unidentified :source (list s)})})))]
216216
(map (partial lciei/prepend-source s) ids))))
217217

218-
(defn- filter-blanks
219-
"Filter blank strings out of coll"
220-
[coll]
221-
(when (seq coll)
222-
(seq (filter #(or (not (string? %)) (not (s/blank? %))) coll))))
223-
224-
(defn- map-split-and-interpose
225-
"Maps over the given sequence, splitting strings using the given regex re and
226-
interposing the given value inter, returning a (flattened) sequence."
227-
[re inter coll]
228-
(mapcat #(if-not (string? %)
229-
[%]
230-
(let [splits (s/split % re)]
231-
(if (nil? inter)
232-
splits
233-
(interpose inter splits))))
234-
coll))
235-
236-
(defn split-on-operators
237-
"Case insensitively splits a string based on license operators (and,
238-
or, with), but only if they're not also part of a license name (e.g.
239-
'Common Development and Distribution License', 'GNU General Public
240-
License version 2.0 or (at your option) any later version', etc.)."
241-
[s]
242-
(when-not (s/blank? s)
243-
(->> (s/split (s/trim s) #"(?i)\band[/-\\]+or\b")
244-
(map-split-and-interpose #"(?i)(\band\b|\&)(?!\s+(distribution|all\s+rights\s+reserved))"
245-
:and)
246-
(map-split-and-interpose #"(?i)\bor\b(?!\s*(-?(greater|(any\s+)?later|(any\s+)?lator|(any\s+)?newer|lesser|library|\(?at\s+your\s+(option|discretion)\)?|([\"']?(Revised|Modified)[\"']?))))"
247-
:or)
248-
(map-split-and-interpose #"(?i)\b(with\b|w/)(?!\s+the\s+acknowledgment\s+clause\s+removed)"
249-
:with)
250-
(map-split-and-interpose #"(?i)(?<=CDDL)/(?=GPL)" ; Special case for splitting particularly cursed combos such as CDDL/GPLv2+CE
251-
nil)
252-
filter-blanks
253-
(map #(if (string? %) (s/trim %) %)))))
254-
255218
(defn- fix-unidentified
256219
"Fixes a singleton UNIDENTIFIED- expression info map by converting the id to
257220
either a lice-comb unidentified LicenseRef or AdditionRef, depending on prev.
@@ -341,16 +304,13 @@
341304
(recur (process-expression-element result f) (first r) (rest r))
342305
(manual-fixes (into {} result)))))
343306

344-
(defn name->expressions-info
345-
"Returns an expressions-info map for the given license name."
307+
(defn parse-name
308+
"Parses the given license `n`ame, returning an expressions-info map."
346309
[n]
347310
(when-not (s/blank? n)
348311
(let [n (s/trim n)
349312
partial-result (some->> n
350-
split-on-operators ; Split on operators
351-
(drop-while keyword?) ; Drop (nonsensical) leading operators
352-
(lc3/rdrop-while keyword?) ; Drop (nonsensical) trailing operators
353-
dedupe ; Deduplicate consecutive identical values (mostly applies to duplicate operators, which are redundant)
313+
lcisp/split-on-operators ; Split on operators
354314
(map #(if (keyword? %) % (string->ids-info %))) ; Determine SPDX ids (or UNIDENTIFIED-xxx) with info for all non-operators
355315
flatten ; Flatten back to an unnested sequence (since string->ids-info returns sequences)
356316
fix-unidentifieds ; Convert each unidentified non-operator into either a LicenseRef or AdditionRef, depending on context
@@ -374,7 +334,7 @@
374334
Note: this method has a substantial performance cost."
375335
[]
376336
(lcis/init!)
377-
(lcirm/init!)
337+
(lciid/init!)
378338
(lcihttp/init!)
379339
@cursed-names-d
380340
nil)

src/lice_comb/impl/spdx.clj

+28-8
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"SPDX-related functionality. Note: this namespace is not part of the public
2121
API of lice-comb and may change without notice."
2222
(:require [clojure.string :as s]
23+
[embroidery.api :as e]
2324
[spdx.licenses :as sl]
2425
[spdx.exceptions :as se]
2526
[spdx.expressions :as sexp]
@@ -52,15 +53,28 @@
5253
(def ^:private unidentified-addition-ref-prefix (str lice-comb-addition-ref-prefix "-UNIDENTIFIED"))
5354

5455
; Lower case id map
55-
(def spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d))
56-
(into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d)))))
56+
(def ^:private spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d))
57+
(into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d)))))
58+
59+
(defn near-match-id
60+
"Returns the (case-corrected) id for the given license or exception id `id`,
61+
or `nil` if one wasn't found."
62+
[id]
63+
(get @spdx-ids-d (s/lower-case id)))
5764

5865
(defn- name-to-id-tuple
5966
[list-entry]
6067
[(s/lower-case (s/trim (:name list-entry))) (:id list-entry)])
6168

62-
(def index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d)))
63-
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d))))))
69+
(def ^:private index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d)))
70+
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d))))))
71+
72+
;####TODO: REPLACE THIS WITH REGEX BASED NEAR-MATCHING (to account for whitespace variance and #"licen[cs]e", for example)
73+
(defn near-match-name
74+
"Returns the id(s) for the given license or exception name `n`, or `nil` if
75+
no ids were found."
76+
[n]
77+
(get @index-name-to-id-d (s/lower-case n)))
6478

6579
(defn- urls-to-id-tuples
6680
"Extracts all urls for a given list (license or exception) entry."
@@ -69,8 +83,14 @@
6983
simplified-uris (map lciu/simplify-uri (filter (complement s/blank?) (concat (:see-also list-entry) (get-in list-entry [:cross-refs :url]))))]
7084
(map #(vec [% id]) simplified-uris)))
7185

72-
(def index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d)))
73-
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d))))))
86+
(def ^:private index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d)))
87+
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d))))))
88+
89+
(defn near-match-uri
90+
"Returns the id(s) for the given license or exception `uri`, or `nil` if no
91+
ids were found."
92+
[uri]
93+
(get @index-uri-to-id-d (lciu/simplify-uri uri)))
7494

7595
(defn lice-comb-license-ref?
7696
"Is the given id one of lice-comb's custom LicenseRefs?"
@@ -209,8 +229,8 @@
209229
Note: this method has a substantial performance cost."
210230
[]
211231
; Parallelise initialisation of the spdx.licenses and spdx.exceptions namespaces, as they're both sloooooooow (~1.5 mins total)
212-
(let [sl-init (future (sl/init!))
213-
se-init (future (se/init!))]
232+
(let [sl-init (e/future* (sl/init!))
233+
se-init (e/future* (se/init!))]
214234
@sl-init
215235
@se-init)
216236
(sexp/init!)

0 commit comments

Comments
 (0)