Finalising release 3.0

mzattera · mzattera · commit 1ff659458b99 · 2021-09-09T14:41:18.000+02:00
diff --git a/README.md b/README.md
@@ -34,7 +34,7 @@ punctuation) one of them is chosen as the default "space" character (returned by
 
   Special characters also include "unreadable" characters that are used (e.g. in the EVA alphabet) to mark illegible characters in the original text.
 		
-The `Alphabet` class is abstract; to provide an actual implementation simply extend this class and provide methods that 
+The `Alphabet` class is abstract; to provide an actual implementation, simply extend this class and provide methods that 
 list characters accordingly to their category.
 
 The `Alphabet` class provides some static fields to access already defined alphabets:
@@ -98,13 +98,25 @@ which contains IVTFF metadata for the line, namely the locus identifier and the
 copies of the same line exists with different transcribers.
 
 In addition to inherited methods `filterElements()` and `splitElements()`, the methods `filterPages()`, `filterLines()`, `splitPages()`, and `splitLines()`
-can be used to create IVTFF documents by filtering and/or splitting content of an existing document. Again, please refer to JavaDoc fro more details.
+can be used to create IVTFF documents by filtering and/or splitting content of an existing document. Again, please refer to JavaDoc for more details.
+Also note that, based on [working note 003](https://mzattera.github.io/v4j/003/), `PageHeader` exposes a cluster for each page in the manuscript;
+this information can be used to filter or split the manuscript into clusters.
 
 ```Java
-/* Get all biological pages (MAJORITY transcription) */
+/* Get a document containing all and only biological pages (MAJORITY transcription) */
 
 IvtffText doc = VoynichFactory.getDocument(TranscriptionType.MAJORITY);
 doc = doc.filterPages(new PageFilter.Builder().illustrationType("B").build());
+
+/*
+Split the manuscript into clusters (see https://mzattera.github.io/v4j/003/)
+
+clusterMap maps each cluster name (see PageHeader.CLUSTERS) to an IvtffText
+containing the pages in that cluster.
+*/
+
+doc = VoynichFactory.getDocument(TranscriptionType.CONCORDANCE);
+Map<String, IvtffText> clusterMap = doc.splitPages(new PageSplitter.Builder().byCluster().build());
 ```
 
 ### Other (Regular) Texts - `io.github.mattera.v4j.text.txt`
diff --git a/docs/index.md b/docs/index.md
@@ -30,7 +30,13 @@ In other terms, a token is an instance of a term. For example the below line in
 - [Note 002 - Some Basic Statistics](./002)
 
   An Excel file with basic page statistics, useful to build pivots.
+  
+- [Note 003 - Clustering](./003)
 
+  Application of t-SNE visualization and K-Means clustering to the Voynich, showing how pages with the same illustration type and
+  Currier's language also share similar words.
+  
+  This should be considered when applying statistical analysis methods to the manuscript.
 
 
 ---
diff --git a/eclipse/io.github.mattera.v4j/.classpath b/eclipse/io.github.mattera.v4j/.classpath
@@ -25,7 +25,6 @@
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
 		<attributes>
-			<attribute name="module" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
diff --git a/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/Alphabet.java b/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/Alphabet.java
@@ -16,12 +16,10 @@
  */
 public abstract class Alphabet {
 
-	public final static Alphabet EVA = new Eva();
+	public final static Alphabet EVA = new EvaAlphabet();
 
 	public final static Alphabet UTF_16 = new JavaCharset();
 
-	public final static Alphabet SLOT = new Slot();
-
 	/**
 	 * @return a string code for this alphabet, same as that used in the IVTFF file.
 	 */
diff --git a/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/EvaAlphabet.java b/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/EvaAlphabet.java
@@ -12,7 +12,7 @@
  * @author Massimiliano "Maxi" Zattera
  */
 // TODO rename to EVA or EVA extended based on what we really support
-public final class Eva extends Alphabet {
+public final class EvaAlphabet extends Alphabet {
 
 	@Override
 	public String getCodeString() {
@@ -82,6 +82,6 @@ public char[] getUnreadableChars() {
 		return unreadableChars;
 	}
 
-	protected Eva() {
+	protected EvaAlphabet() {
 	}
 }
diff --git a/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/Slot.java b/eclipse/io.github.mattera.v4j/src/main/java/io/github/mattera/v4j/text/alphabet/Slot.java
diff --git a/eclipse/io.github.mzattera.v4j-apps/.classpath b/eclipse/io.github.mzattera.v4j-apps/.classpath
@@ -26,7 +26,6 @@
 	</classpathentry>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
 		<attributes>
-			<attribute name="module" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@`
`12`	`12`	`* @author Massimiliano "Maxi" Zattera`
`13`	`13`	`*/`
`14`	`14`	`// TODO rename to EVA or EVA extended based on what we really support`
`15`		`-public final class Eva extends Alphabet {`
	`15`	`+public final class EvaAlphabet extends Alphabet {`
`16`	`16`
`17`	`17`	`@Override`
`18`	`18`	`public String getCodeString() {`
`@@ -82,6 +82,6 @@ public char[] getUnreadableChars() {`
`82`	`82`	`return unreadableChars;`
`83`	`83`	`}`
`84`	`84`
`85`		`- protected Eva() {`
	`85`	`+ protected EvaAlphabet() {`
`86`	`86`	`}`
`87`	`87`	`}`