Skip to content

Commit 27b72fa

Browse files
committed
🚑 Improve logic for URL simplification
1 parent cb0aea0 commit 27b72fa

File tree

5 files changed

+33
-11
lines changed

5 files changed

+33
-11
lines changed

src/lice_comb/impl/regex_matching.clj

+8-1
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@
113113
"attribution" "Attribution"
114114
"clear" "Clear"
115115
"lbnl" "LBNL"
116+
"hp" "HP"
117+
"sun" "Sun"
118+
"flex" "flex"
119+
"freebsd" "FreeBSD"
120+
"netbsd" "NetBSD"
116121
"modification" "Modification"
117122
("no military license" "no military licence") "No-Military-License"
118123
("no nuclear license" "no nuclear licence") "No-Nuclear-License"
@@ -121,6 +126,8 @@
121126
"open mpi" "Open-MPI"
122127
"shortened" "Shortened"
123128
"uc" "UC"
129+
"darwin" "Darwin"
130+
"acpica" "acpica"
124131
nil)
125132
base-id (str (:id m) "-" clause-count "-Clause")
126133
id-with-suffix (str base-id "-" suffix)]
@@ -251,7 +258,7 @@
251258
:pad-ver? true
252259
:latest-ver "1.0"}
253260
{:id "BSD"
254-
:regex #"(?i)\b(?<clausecount1>\p{Alnum}+)?[\s,-]*(C(lause)?|Type)?\s*\bBSD[\s-]*\(?(Licen[cs]e|Type|C(lause)?)?[\s-]*(?<clausecount2>\p{Alnum}+)?([\s-]+Clause)?(?<suffix>\s+(Patent|Views|Attribution|Clear|LBNL|Modification|No\s+Military\s+Licen[cs]e|No\s+Nuclear\s+Licen[cs]e([\s-]+2014)?|No\s+Nuclear\s+Warranty|Open\s+MPI|Shortened|UC))?"
261+
:regex #"(?i)\b(?<clausecount1>\p{Alnum}+)?[\s,-]*(C(lause)?|Type)?\s*\bBSD[\s-]*\(?(Licen[cs]e|Type|C(lause)?)?[\s-]*(?<clausecount2>\p{Alnum}+)?([\s-]+Clause)?(?<suffix>\s+(Patent|Views|Attribution|Clear|LBNL|HP|Sun|flex|FreeBSD|NetBSD|Modification|No\s+Military\s+Licen[cs]e|No\s+Nuclear\s+Licen[cs]e([\s-]+2014)?|No\s+Nuclear\s+Warranty|Open\s+MPI|Shortened|UC|Darwin|acpica))?"
255262
:fn bsd-id-constructor}
256263
{:id "CC0"
257264
:regex #"(?i)\bCC\s*0"

src/lice_comb/impl/utils.clj

+4-2
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,10 @@
128128
(let [luri (s/lower-case (s/trim uri))]
129129
(if (valid-http-uri? luri)
130130
(-> luri
131-
(s/replace #"\Ahttps?://(www\.)?" "http://") ; Normalise to http and strip any www. extension on hostname
132-
(s/replace #"\.[\p{Alnum}]{3,}\z" "")) ; Strip file type extension (if any)
131+
(s/replace #"\Ahttps?://(www\.)?" "http://") ; Normalise to http and strip any www. extension on hostname
132+
(s/replace #"licen[cs]es?" "license") ; Alternative spelling and plurals of "license"
133+
(s/replace #"\.[\p{Alnum}]{3,}\z" "") ; Strip file type extension (if any)
134+
(s/replace #"/+\z" "")) ; Strip all trailing forward slash (/) characters
133135
luri)))))
134136

135137
(defn readable-dir?

test/lice_comb/impl/utils_test.clj

+11-4
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424

2525
(use-fixtures :once fixture)
2626

27-
(def simplified-apache2-uri "http://apache.org/licenses/license-2.0")
27+
(def simplified-apache2-uri "http://apache.org/license/license-2.0")
2828

2929
(deftest simplify-uri-tests
3030
(testing "Nil, empty or blank values"
@@ -43,8 +43,8 @@
4343
(is (= "mailto:someone@example.com?subject=this%20is%20the%20subject&cc=someone_else@example.com&body=this%20is%20the%20body"
4444
(simplify-uri "mailto:someone@example.com?subject=This%20is%20the%20subject&cc=someone_else@example.com&body=This%20is%20the%20body"))))
4545
(testing "Valid uris that don't get simplified"
46-
(is (= simplified-apache2-uri (simplify-uri simplified-apache2-uri)))
47-
(is (= "http://creativecommons.org/licenses/by-sa/4.0/legalcode" (simplify-uri "http://creativecommons.org/licenses/by-sa/4.0/legalcode"))))
46+
(is (= simplified-apache2-uri (simplify-uri simplified-apache2-uri)))
47+
(is (= "http://creativecommons.org/license/by-sa/4.0/legalcode" (simplify-uri "http://creativecommons.org/licenses/by-sa/4.0/legalcode"))))
4848
(testing "Valid uris that get simplified"
4949
(is (= simplified-apache2-uri (simplify-uri "http://www.apache.org/licenses/LICENSE-2.0")))
5050
(is (= simplified-apache2-uri (simplify-uri "https://www.apache.org/licenses/LICENSE-2.0")))
@@ -55,7 +55,14 @@
5555
(is (= simplified-apache2-uri (simplify-uri "https://www.apache.org/licenses/license-2.0.txt")))
5656
(is (= simplified-apache2-uri (simplify-uri "http://apache.org/licenses/LICENSE-2.0.pdf")))
5757
(is (= simplified-apache2-uri (simplify-uri " http://www.apache.org/licenses/LICENSE-2.0.html ")))
58-
(is (= "http://gnu.org/licenses/agpl" (simplify-uri "https://www.gnu.org/licenses/agpl.txt")))
58+
(is (= "http://gnu.org/license/agpl" (simplify-uri "https://www.gnu.org/licenses/agpl.txt")))
59+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/license/MIT")))
60+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/license/MIT/")))
61+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/license/mit/")))
62+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/license/MIT.TXT")))
63+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/licence/MIT")))
64+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/licenses/MIT")))
65+
(is (= "http://opensource.org/license/mit" (simplify-uri "https://opensource.org/licences/MIT")))
5966
(is (= "http://gnu.org/software/classpath/license" (simplify-uri "https://www.gnu.org/software/classpath/license.html")))
6067
(is (= "http://raw.githubusercontent.com/pmonks/lice-comb/main/license" (simplify-uri "https://raw.githubusercontent.com/pmonks/lice-comb/main/LICENSE")))
6168
(is (= "http://github.com/pmonks/lice-comb/blob/main/license" (simplify-uri "https://github.com/pmonks/lice-comb/blob/main/LICENSE")))))

test/lice_comb/matching_test.clj

+6-3
Original file line numberDiff line numberDiff line change
@@ -101,12 +101,15 @@
101101
(is (valid= #{"GPL-2.0-only WITH Classpath-exception-2.0"} (name->expressions "GPL-2.0 WITH Classpath-exception-2.0")))
102102
(is (valid= #{"Apache-2.0 OR GPL-3.0-only"} (name->expressions "Apache-2.0 OR GPL-3.0")))
103103
(is (valid= #{"EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0 OR MIT OR (BSD-3-Clause AND Apache-2.0)"} (name->expressions "EPL-2.0 OR (GPL-2.0+ WITH Classpath-exception-2.0) OR MIT OR (BSD-3-Clause AND Apache-2.0)"))))
104-
(testing "Single expressions that are not valid SPDX"
104+
(testing "Single expressions that are not SPDX expressions"
105105
(is (valid= #{"GPL-2.0-only WITH Classpath-exception-2.0"} (name->expressions "GNU General Public License, version 2 with the GNU Classpath Exception")))
106106
(is (valid= #{"Apache-2.0 OR GPL-3.0-only"} (name->expressions "Apache License version 2.0 or GNU General Public License version 3")))
107107
(is (valid= #{"EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0 OR MIT OR (BSD-3-Clause AND Apache-2.0)"} (name->expressions "EPL-2.0 OR (GPL-2.0+ WITH Classpath-exception-2.0) OR MIT OR (BSD-3-Clause AND Apache-2.0)")))
108108
(is (valid= #{"Apache-2.0 AND MIT"} (name->expressions "Apache & MIT licence")))
109-
(is (valid= #{"CDDL-1.1"} (name->expressions "Common Development and Distribution Licence"))))
109+
(is (valid= #{"CDDL-1.1"} (name->expressions "Common Development and Distribution Licence")))
110+
(is (valid= #{"BSD-2-Clause-FreeBSD"} (name->expressions "BSD 2 clause freebsd")))
111+
(is (valid= #{"BSD-2-Clause-Darwin"} (name->expressions "BSD 2 clause darwin"))) ; Since SPDX v3.23
112+
(is (valid= #{"BSD-3-Clause-acpica"} (name->expressions "BSD 3 CLAUSE ACPICA")))) ; Since SPDX v3.23
110113
(testing "Expressions with weird operators"
111114
(is (valid= #{"Apache-2.0"} (name->expressions "and and and Apache License 2.0")))
112115
(is (valid= #{"Apache-2.0"} (name->expressions "Apache Licence 2.0 or or or")))
@@ -119,7 +122,7 @@
119122
(is (valid= #{"Apache-2.0" "GPL-3.0-only"} (name->expressions "Apache License version 2.0 / GNU General Public License version 3")))
120123
(is (valid= #{"Apache-2.0" "GPL-3.0-only WITH Classpath-exception-2.0"} (name->expressions "Apache License version 2.0 / GNU General Public License version 3 with classpath exception")))
121124
(is (valid= #{"EPL-2.0 OR (GPL-2.0-or-later WITH Classpath-exception-2.0 AND MIT) OR (BSD-3-Clause AND Apache-2.0)"} (name->expressions "Eclipse Public License or General Public License 2.0 or (at your discretion) later w/ classpath exception aNd MIT Licence or three clause bsd and Apache Licence"))))
122-
(testing "Messed up license expressions"
125+
(testing "Cursed license expressions"
123126
(is (valid= #{"Apache-2.0" "MIT"} (name->expressions "Apache with MIT"))))
124127
(testing "Names seen in handpicked POMs on Maven Central"
125128
(is (valid= #{"AGPL-3.0-only"} (name->expressions "GNU Affero General Public License (AGPL) version 3.0")))

test/lice_comb/test_boilerplate.clj

+4-1
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,12 @@
1818

1919
(ns lice-comb.test-boilerplate
2020
(:require [clojure.spec.alpha :as spec]
21+
[spdx.licenses :as slic]
2122
[spdx.expressions :as sexp]))
2223

23-
(println "\n☔️ Running tests on Clojure" (clojure-version) "/ JVM" (System/getProperty "java.version") (str "(" (System/getProperty "java.vm.name") " v" (System/getProperty "java.vm.version") ")\n"))
24+
(println "\n☔️ Running tests on Clojure" (clojure-version)
25+
"/ JVM" (System/getProperty "java.version") (str "(" (System/getProperty "java.vm.name") " " (System/getProperty "java.vm.version") ")")
26+
"/ SPDX License List" (slic/version) "\n")
2427

2528
(println "⚠️ Note: these tests take between 5 and 10 minutes 🐢")
2629

0 commit comments

Comments
 (0)