Skip to content

Commit b208003

Browse files
committed
🚧 WIP on replace-first algorithm - case insensitivity tests
1 parent acceb84 commit b208003

File tree

4 files changed

+20
-26
lines changed

4 files changed

+20
-26
lines changed

src/lice_comb/impl/regexes.clj

+17-23
Original file line numberDiff line numberDiff line change
@@ -50,17 +50,12 @@
5050
fre-ows
5151
(re/ncg "versionNumber"
5252
(s/join "\\." (map #(str "0*" %) non-zero-version-components))
53-
#"(?:\.0+)*") ; Allow any number of ".0" to appear at the end
53+
#"(?:\.0+)*") ; Allow any number of ".0" to appear at the end
5454
fre-ows
5555
(case [only? or-later?]
56-
[true false] (re/opt fre-only)
57-
[false true] (re/opt fre-or-later)
58-
(re/opt fre-only-or-later))
59-
;####TODO: REMOVE
60-
; (if only?
61-
; (re/opt-grp fre-only)
62-
; (re/opt-grp fre-or-later))
63-
)))
56+
[true false] (re/opt fre-only) ; only only
57+
[false true] (re/opt fre-or-later) ; or-later only
58+
(re/opt fre-only-or-later))))) ; Undefined, so accept either
6459

6560
; Note: some of the regexes in this namespace use classes (e.g. [\\/-\s]{1,4}) instead of alternation (e.g. (\\|/|-|\s){1,4}) due to an apparent bug in the JVM's regex libraries when
6661
; the latter are used in look-behind groups. See https://stackoverflow.com/questions/24874404/java-regex-look-behind-group-does-not-have-obvious-maximum-length-error/24922107
@@ -77,14 +72,16 @@
7772
(lciu/replace-in-coll #"(?i)\-(?<versionNumber>\d+\.\d+(?:\.\d+)*)(?:(?<only>-only)|(?<orLater>\+|-or-later))?(?=(-|\z))"
7873
#(re/join #"[\s\-–—]*" (re-version-replacement %))) ; Note: we handle leading whitespace slightly differently in id regexes vs name regexes
7974
; Special cases for certain licenses
80-
(lciu/replace-in-coll #"(?i)(?<!\w)AGPL(?!\w)" #"(?:GNU[\s\-–—]+)?A[\s\-–—]*GPL")
81-
(lciu/replace-in-coll #"(?i)(?<!\w)LGPL(?!\w)" #"(?:GNU[\s\-–—]+)?L[\s\-–—]*GPL")
82-
(lciu/replace-in-coll #"(?i)(?<!\w)GPL(?!\w)" #"(?:GNU[\s\-–—]+)?[\s\-–—]*GPL")
75+
;####TODO: TEST WHETHER THIS IS EVEN NEEDED
76+
; (lciu/replace-in-coll #"(?i)(?<!\w)AGPL(?!\w)" #"(?:GNU[\s\-–—]+)?A[\s\-–—]*GPL")
77+
; (lciu/replace-in-coll #"(?i)(?<!\w)LGPL(?!\w)" #"(?:GNU[\s\-–—]+)?L[\s\-–—]*GPL")
78+
; (lciu/replace-in-coll #"(?i)(?<!\w)GPL(?!\w)" #"(?:GNU[\s\-–—]+)?[\s\-–—]*GPL")
8379
(lciu/replace-in-coll #"(?i)(?<!\w)MIT(?!\w)" #"(?<!(?:X11|ISC)[\\/\-\s]{1,4})MIT(?![\\/\-\s]{1,4}(?:X11|ISC))")
8480
(lciu/replace-in-coll #"(?i)(?<!\w)X11(?!\w)" #"(?:MIT[\\/\-\s]{1,4})?X11(?:[\\/\-\s]{1,4}MIT)?")
8581
(lciu/replace-in-coll #"(?i)(?<!\w)ISC(?!\w)" #"(?:MIT[\\/\-\s]{1,4})?ISC(?:[\\/\-\s]{1,4}MIT)?")
8682
(lciu/replace-in-coll #"(?i)(?<!\w)(?<!zlib/)libpng(?!\w)" #"(?<!zlib/[\\/\-\s]{1,4})libpng(?![\\/\-\s]{1,4}zlib)")
87-
(lciu/replace-in-coll #"(?i)BSD\-(?<clauseCount>\d+)\-Clause" (fn [m] (re/join #"BSD[\s\-–—]*0*" (get m "clauseCount") #"[\s\-–—]*Clause"))) ; For BSD
83+
;####TODO: TEST WHETHER THIS IS EVEN NEEDED
84+
; (lciu/replace-in-coll #"(?i)BSD\-(?<clauseCount>\d+)\-Clause" (fn [m] (re/join #"BSD[\s\-–—]*0*" (get m "clauseCount") #"[\s\-–—]*Clause"))) ; For BSD
8885
; Character equivalents
8986
(lciu/replace-in-coll #"[\s\-]+" #"[\s\-–—]+") ; Note: hyphen, en-dash, em-dash
9087
; Cleanup and combine into a single pattern
@@ -100,12 +97,12 @@
10097
(-> [#"(?iuU)(?<!\w)(The[\s\-–—]+)?" (s/trim n) #"(?!\w)"]
10198
;####TODO: TEST WHETHER THESE ARE EVEN NEEDED
10299
; Special case GNU family first, as they're such a massive pita
103-
(lciu/replace-in-coll #"(?i)(?<!\w)GNU\s+" #"(?:GNU[\s\-–—]+)?")
104-
(lciu/replace-in-coll #"(?i)(?<!\w)Affero General Public License" #"Affero[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?A[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
105-
(lciu/replace-in-coll #"(?i)(?<!\w)Library General Public License" #"(?:Library|Less[eo]r|Library[\s\-–—]+or[\s\-–—]+Less[eo]r|Less[eo]r[\s\-–—]+or[\s\-–—]+Library)[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?L[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
106-
(lciu/replace-in-coll #"(?i)(?<!\w)Lesser General Public License" #"(?:Library|Less[eo]r|Library[\s\-–—]+or[\s\-–—]+Less[eo]r|Less[eo]r[\s\-–—]+or[\s\-–—]+Library)[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?L[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
107-
(lciu/replace-in-coll #"(?i)(?<!\w)General Public License" #"Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e([\s\-–—]+\(?GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
108-
(lciu/replace-in-coll #"(?i)(?<!\w)\"Original\" or \"Old\" License" #"(\"?Original\"?(?:[\s\-–—]+or[\s\-–—]+\"?Old\"?)?(?:[\s\-–—]+Licen[cs]e)?)?") ; BSD-4-Clause
100+
; (lciu/replace-in-coll #"(?i)(?<!\w)GNU\s+" #"(?:GNU[\s\-–—]+)?")
101+
; (lciu/replace-in-coll #"(?i)(?<!\w)Affero General Public License" #"Affero[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?A[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
102+
; (lciu/replace-in-coll #"(?i)(?<!\w)Library General Public License" #"(?:Library|Less[eo]r|Library[\s\-–—]+or[\s\-–—]+Less[eo]r|Less[eo]r[\s\-–—]+or[\s\-–—]+Library)[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?L[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
103+
; (lciu/replace-in-coll #"(?i)(?<!\w)Lesser General Public License" #"(?:Library|Less[eo]r|Library[\s\-–—]+or[\s\-–—]+Less[eo]r|Less[eo]r[\s\-–—]+or[\s\-–—]+Library)[\s\-–—]+Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e(?:[\s\-–—]+\(?L[\s\-–—]*GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
104+
; (lciu/replace-in-coll #"(?i)(?<!\w)General Public License" #"Genere?al[\s\-–—]+Pub?lic[\s\-–—]+Licen[cs]e([\s\-–—]+\(?GPL(?:[\s\-–—]*v)?[\s\d\._]*\))?")
105+
; (lciu/replace-in-coll #"(?i)(?<!\w)\"Original\" or \"Old\" License" #"(\"?Original\"?(?:[\s\-–—]+or[\s\-–—]+\"?Old\"?)?(?:[\s\-–—]+Licen[cs]e)?)?") ; BSD-4-Clause
109106
; Special cases for certain licenses
110107
(lciu/replace-in-coll #"(?i)(?<!\w)Apache(?!\w)" #"Apache(?:[\s\-–—]*Software)?")
111108
(lciu/replace-in-coll #"(?i)(?<!\w)Creative Commons(?!\w)" #"(?:Creative[\s\-–—]*Commons|CC)")
@@ -147,9 +144,6 @@
147144
(lciu/replace-in-coll #"(?i)hardware(?!\w)" #"(?:Hardware)?")
148145
(lciu/replace-in-coll #"(?i)\s+generic(?!\w)" #"(?:[\s\-–—]+Generic)?")
149146
(lciu/replace-in-coll #"(?i)generic(?!\w)" #"(?:Generic)?")
150-
; Note: can't do this due to 'Linux man-pages Copyleft Variant' and 'Linux man-pages Copyleft'
151-
; (lciu/replace-in-coll #"(?i)\s+variant(?!\w)" #"(?:[\s\-–—]+Variant)?")
152-
; (lciu/replace-in-coll #"(?i)variant(?!\w)" #"(?:Variant)?")
153147
(lciu/replace-in-coll #"(?i)\s+international(?!\w)" #"(?:[\s\-–—]+International)?")
154148
(lciu/replace-in-coll #"(?i)international(?!\w)" #"(?:International)?")
155149
; Alternative spellings
@@ -172,7 +166,7 @@
172166
(lciu/replace-in-coll #"(?i)(?<!\w)(and|&)(?!\w)" #"(?:and|&)")
173167
; Character equivalents
174168
(lciu/replace-in-coll #"(?i)é" #"[ée]") ; As of License List v3.26.0 'é' is the only accented character present
175-
(lciu/replace-in-coll #"\"" #"[\"“”„‟'‘’‚‛`]")
169+
(lciu/replace-in-coll #"\"" fre-quote)
176170
(lciu/replace-in-coll #"\s*/\s*" #"\s*[\\/\-–—]\s*") ; hyphen, en-dash, em-dash
177171
(lciu/replace-in-coll #"[\s\-–]+" #"[\s\-–—]+") ; hyphen, en-dash, em-dash. en-dash is in e.g. the name of LiLiQ-R-1.1
178172
(lciu/replace-in-coll #"[\(\[\{«‹]+" #"[\(\[\{«‹]*") ; Make parens optional

src/lice_comb/impl/substitutions/cpe.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
(def ^:private pairs-d (delay (concat
2424
(lcisu/spdx-match-pairs @ids-d) ; Generic license regexes handle most cases, except...
25-
[[(re/join #"(?i)(GNU[\s\-–—]*)?(?:CPE|Classpath[\s\-–—]+exception)") ; ...when no version is provided (and note that exceptions can't have "only", "+", "or later", etc.)
25+
[[(re/join #"(?iuU)(GNU[\s\-–—]*)?(?:CPE|Classpath[\s\-–—]+exception)") ; ...when no version is provided (and note that exceptions can't have "only", "+", "or later", etc.)
2626
(fn [m]
2727
{:id "Classpath-exception-2.0"
2828
:type :concluded

src/lice_comb/impl/substitutions/hippocratic.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
(def ^:private pairs-d (delay (concat
2525
(lcisu/spdx-match-pairs @ids-d) ; Generic license regexes handle most cases, except...
26-
[[(re/join #"(?i)Hippocratic([\s\-–—]+Licen?[cs]e)?" (re/opt-grp lcir/fre-ows lcir/fre-only-or-later)) ; ...when no version is provided
26+
[[(re/join #"(?iuU)Hippocratic([\s\-–—]+Licen?[cs]e)?" (re/opt-grp lcir/fre-ows lcir/fre-only-or-later)) ; ...when no version is provided
2727
(fn [m]
2828
{:id (str "Hippocratic-2.1" (when (get m "orLater") "+"))
2929
:type :concluded

src/lice_comb/impl/substitutions/mpl.clj

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323

2424
(def ^:private pairs-d (delay (concat
2525
(lcisu/spdx-match-pairs @ids-d) ; Generic license regexes handle most cases, except...
26-
[[(re/join #"(?i)(?:MPL|Mozilla([\s\-–—]+Public)?([\s\-–—]+Licen?[cs]e)?)" (re/opt-grp lcir/fre-ows lcir/fre-only-or-later)) ; ...when no version is provided
26+
[[(re/join #"(?iuU)(?:MPL|Mozilla([\s\-–—]+Public)?([\s\-–—]+Licen?[cs]e)?)" (re/opt-grp lcir/fre-ows lcir/fre-only-or-later)) ; ...when no version is provided
2727
(fn [m]
2828
{:id (str "MPL-2.0" (when (get m "orLater") "+"))
2929
:type :concluded

0 commit comments

Comments
 (0)