mzattera
diff --git a/‎docs/005/index.md
Lines changed: 1 addition & 1 deletion b/‎docs/005/index.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/010/images/A.PNG
57.7 KB b/‎docs/010/images/A.PNG
57.7 KB
diff --git a/‎docs/010/images/B.PNG
58.4 KB b/‎docs/010/images/B.PNG
58.4 KB
diff --git a/‎docs/010/images/C.PNG
54.5 KB b/‎docs/010/images/C.PNG
54.5 KB
diff --git a/‎docs/010/images/SummaryTable.PNG
-35.7 KB b/‎docs/010/images/SummaryTable.PNG
-35.7 KB
diff --git a/‎docs/010/index.md
Lines changed: 98 additions & 171 deletions b/‎docs/010/index.md
Lines changed: 98 additions & 171 deletions
diff --git a/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CharDistributionAnalysis.java
Lines changed: 11 additions & 5 deletions b/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CharDistributionAnalysis.java
Lines changed: 11 additions & 5 deletions
diff --git a/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CountGallows.java
Lines changed: 171 additions & 0 deletions b/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CountGallows.java
Lines changed: 171 additions & 0 deletions
diff --git a/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CurrierRepeatTest.java
Lines changed: 4 additions & 2 deletions b/‎eclipse/io.github.mzattera.v4j-apps/src/main/java/io/github/mzattera/v4j/applications/chars/CurrierRepeatTest.java
Lines changed: 4 additions & 2 deletions
diff --git a/‎resources/analysis/char distribution/Character Distribution.xlsx
2.28 KB b/‎resources/analysis/char distribution/Character Distribution.xlsx
2.28 KB
diff --git a/‎resources/analysis/char distribution/CountGallows.xlsx
3.82 KB b/‎resources/analysis/char distribution/CountGallows.xlsx
3.82 KB
@@ -164,7 +164,7 @@ they are ignored by most authors in any analysis of the text.
 
 Some glyphs (EVA 't', 'k', 'p' and 'f') appear taller than other characters and are traditionally referred to as "gallows".
 The combination 'ch' is instead called "pedestal". Some glyphs (EVA 'cth', 'ckh', 'cph' and 'cfh') appear visually as an 
-overlap of the pedestal with one of the gallows and are therefore called "pedestalled gallows".
+overlap of the pedestal with one of the gallows and are therefore called "pedestalled gallows" (or "split gallows").
 These glyphs appear in slots 3, 4, and 5 and are shown in the table below.
 
 ![Gallows and Pedestal](images/Gallows.PNG)
 
@@ -66,7 +66,7 @@ public abstract class CharDistributionAnalysis {
 	private static final double ALPHA2 = 0.05d;
 
 	/** Compact output */
-	private static final boolean COMPACT = false;
+	private static final boolean COMPACT = true;
 
 	protected CharDistributionAnalysis() {
 	}
@@ -120,15 +120,20 @@ private static void process(IvtffText doc, Experiment experiment) {
 		}
 
 		// Header with chars
-		System.out.print("Cluster;Significance [alpha];");
+		System.out.print("Cluster;");
+		if (!COMPACT)
+			System.out.print("Significance [alpha];");
 		for (int i = 0; i < chars.length; ++i)
 			System.out.print(chars[i] + ";");
 		System.out.println();
 
 		// Do analysis for each cluster
 		for (String cluster : PageHeader.CLUSTERS) {
 
-// TEST			IvtffText clusterText = doc.filterPages(new PageFilter.Builder().cluster(cluster).build()).shuffledText();
+			// TEST with random text
+			// IvtffText clusterText = doc.filterPages(new
+			// PageFilter.Builder().cluster(cluster).build()).shuffledText();
+
 			IvtffText clusterText = doc.filterPages(new PageFilter.Builder().cluster(cluster).build());
 			process(cluster, clusterText, COMPACT, experiment);
 
@@ -185,7 +190,8 @@ public static List<Character>[] process(String cluster, IvtffText txt, boolean c
 		// there is a significative difference
 		System.out.print(cluster + ";");
 		double confidence = ChiSquared.chiSquareTestDataSetsComparison(parts[0], parts[1], adjustedDistribution, false);
-		System.out.printf("%.2f%%;", confidence * 100);
+		if (!COMPACT)
+			System.out.printf("%.2f%%;", confidence * 100);
 		if (confidence > ALPHA) { // Not a difference in the two part that is statistically significant; exit
 			return result;
 		}
@@ -213,7 +219,7 @@ public static List<Character>[] process(String cluster, IvtffText txt, boolean c
 			// as a minimum?
 //			double expectedCount = cd.getFrequency()
 //					* Math.min(parts[0].getChars().getTotalCounted(), parts[1].getChars().getTotalCounted());
-			
+
 			// Observe actual frequencies of the char (2 categories: the char and all
 			// others)
 			long[] obs1 = ChiSquared.observe(parts[0], chars[i], false);
 
@@ -0,0 +1,171 @@
+/**
+ * 
+ */
+package io.github.mzattera.v4j.applications.chars;
+
+import java.util.List;
+import java.util.Map.Entry;
+
+import io.github.mzattera.v4j.experiment.Experiment;
+import io.github.mzattera.v4j.text.ElementFilter;
+import io.github.mzattera.v4j.text.Text;
+import io.github.mzattera.v4j.text.alphabet.Alphabet;
+import io.github.mzattera.v4j.text.ivtff.IvtffPage;
+import io.github.mzattera.v4j.text.ivtff.IvtffText;
+import io.github.mzattera.v4j.text.ivtff.LineFilter;
+import io.github.mzattera.v4j.text.ivtff.PageFilter;
+import io.github.mzattera.v4j.text.ivtff.PageHeader;
+import io.github.mzattera.v4j.text.ivtff.VoynichFactory;
+import io.github.mzattera.v4j.text.ivtff.VoynichFactory.Transcription;
+import io.github.mzattera.v4j.text.ivtff.VoynichFactory.TranscriptionType;
+import io.github.mzattera.v4j.util.Counter;
+
+/**
+ * Prints statistics about tokens starting with, or containing, gallows
+ * characters, broken down by cluster and by position of the token in the text
+ * (first word of a paragraph, remaining words in the first line, first word of
+ * the other lines, all remaining words).
+ * <p>
+ * Results are written to standard output as semicolon-separated rows. Although
+ * the section titles mention "%", the printed values are fractions of the
+ * tokens found in the given position, not percentages.
+ * 
+ * @author Massimiliano "Maxi" Zattera
+ *
+ */
+public final class CountGallows {
+
+	/**
+	 * Which transcription to use.
+	 */
+	public static final Transcription TRANSCRIPTION = Transcription.AUGMENTED;
+
+	/**
+	 * Which transcription type to use.
+	 */
+	public static final TranscriptionType TRANSCRIPTION_TYPE = TranscriptionType.MAJORITY;
+
+	/**
+	 * Which Alphabet type to use.
+	 */
+	public static final Alphabet ALPHABET = Alphabet.SLOT;
+
+	/** Filter to use on pages before analysis; null means no page filtering. */
+	public static final ElementFilter<IvtffPage> FILTER = null;
+
+	/**
+	 * Characters counted by this analysis; they are matched against the text as
+	 * loaded using {@link #ALPHABET}.
+	 */
+	private static final char[] GALLOWS = { 't', 'k', 'p', 'f', 'T', 'K', 'P', 'F', 'C', 'S' };
+
+	private CountGallows() {
+	}
+
+	/**
+	 * Loads the document, keeps only lines accepted by
+	 * {@link LineFilter#PARAGRAPH_TEXT_FILTER} (and pages accepted by
+	 * {@link #FILTER}, if any), then prints the two reports: tokens containing
+	 * gallows and tokens starting with gallows.
+	 * 
+	 * @param args Not used.
+	 */
+	public static void main(String[] args) {
+		try {
+			// Prints configuration parameters
+			System.out.println("Transcription     : " + TRANSCRIPTION);
+			System.out.println("Transcription Type: " + TRANSCRIPTION_TYPE);
+			System.out.println("Alphabet          : " + ALPHABET);
+			System.out.println("Filter            : " + (FILTER == null ? "<no-filter>" : FILTER)
+					+ " running text only (P0 & P1)");
+			System.out.println();
+
+			IvtffText doc = VoynichFactory.getDocument(TRANSCRIPTION, TRANSCRIPTION_TYPE, ALPHABET);
+			doc = doc.filterLines(LineFilter.PARAGRAPH_TEXT_FILTER);
+			if (FILTER != null)
+				doc = doc.filterPages(FILTER);
+
+			System.out.println("% of Tokens Containing Gallows");
+			process(doc, false);
+			System.out.println("\n\n% of Tokens Starting with Gallows");
+			process(doc, true);
+
+		} catch (Exception e) {
+			e.printStackTrace();
+		} finally {
+			// Printed even when an exception was reported above
+			System.out.println("\nCompleted.");
+		}
+
+	}
+
+	/**
+	 * Prints one report: a header row followed by four rows (one per token
+	 * position) for each cluster in {@link PageHeader#CLUSTERS}.
+	 * 
+	 * @param doc   Text to analyze.
+	 * @param first If true count tokens starting with gallows, otherwise count
+	 *              tokens containing gallows.
+	 */
+	private static void process(IvtffText doc, boolean first) {
+		// Header; the last column names the row total printed by analyze()
+		System.out.print("Locus;Cluster;");
+		for (char c : GALLOWS)
+			System.out.print(c + ";");
+		System.out.println("Total");
+
+		for (String cluster : PageHeader.CLUSTERS) {
+			IvtffText clusterText = doc.filterPages(new PageFilter.Builder().cluster(cluster).build());
+
+			// Gets words in different positions;
+			// TODO NOTICE WE IGNORE UNREADABLE WORDS!!!!!
+
+			// split[0] is used as the "first line" part and split[1] as the rest
+			Text[] split = new Experiment.FirstLineInParagraph(false).splitDocument(clusterText);
+
+			List<Counter<String>> firstLineWords = Experiment.getWordsByPosition((IvtffText) split[0], true);
+			printRow("First Word of a Paragraph", cluster, wordsInFirstPosition(firstLineWords), first);
+			printRow("Remaining Words in First Line", cluster, wordsInOtherPositions(firstLineWords), first);
+
+			List<Counter<String>> otherLineWords = Experiment.getWordsByPosition((IvtffText) split[1], true);
+			printRow("First Word of Other Lines", cluster, wordsInFirstPosition(otherLineWords), first);
+			printRow("Remaining Words", cluster, wordsInOtherPositions(otherLineWords), first);
+		} // for each cluster
+	}
+
+	/**
+	 * Prints a single result row: locus label, cluster, then the figures computed
+	 * by {@link #analyze(Counter, boolean)}.
+	 */
+	private static void printRow(String locus, String cluster, Counter<String> words, boolean first) {
+		System.out.print(locus + ";" + cluster + ";");
+		analyze(words, first);
+	}
+
+	/**
+	 * Returns the words counted at position 0, or an empty counter when the list
+	 * has no positions at all (guards against a part of a cluster without words).
+	 */
+	private static Counter<String> wordsInFirstPosition(List<Counter<String>> byPosition) {
+		if (byPosition.isEmpty())
+			return new Counter<String>();
+		return byPosition.get(0);
+	}
+
+	/** Returns a new counter merging the words counted at every position after the first. */
+	private static Counter<String> wordsInOtherPositions(List<Counter<String>> byPosition) {
+		Counter<String> merged = new Counter<>();
+		for (int i = 1; i < byPosition.size(); ++i) {
+			merged.countAll(byPosition.get(i));
+		}
+		return merged;
+	}
+
+	/**
+	 * Prints relevant figures for given word distribution: for each character in
+	 * {@link #GALLOWS}, the fraction of tokens matching it, followed by the sum of
+	 * these fractions. When counting tokens that contain gallows, a token with
+	 * several different gallows is counted once for each of them, so the sum can
+	 * exceed 1.
+	 * 
+	 * @param words Words to analyze, with the number of times each one occurs.
+	 * @param first If true count tokens starting with gallows, otherwise count
+	 *              tokens containing gallows.
+	 */
+	private static void analyze(Counter<String> words, boolean first) {
+
+		// How many tokens with given gallows
+		Counter<Character> tokens = new Counter<>();
+
+		for (Entry<String, Integer> e : words.entrySet()) {
+			String word = e.getKey();
+			if (word.isEmpty())
+				continue; // nothing to match; also avoids charAt(0) failing below
+
+			for (char c : GALLOWS) {
+				boolean matches = first ? (word.charAt(0) == c) : (word.indexOf(c) != -1);
+				if (matches) {
+					tokens.count(c, e.getValue());
+				}
+			}
+		}
+
+		// Print results; with no tokens at all every fraction is reported as 0
+		double total = words.getTotalCounted();
+		double sum = 0.0;
+		for (char c : GALLOWS) {
+			double p = (total > 0) ? (tokens.getCount(c) / total) : 0.0;
+			sum += p;
+			System.out.print(p + ";");
+		}
+		System.out.println(sum);
+	}
+}
@@ -10,6 +10,7 @@
 import io.github.mzattera.v4j.text.ivtff.IvtffLine;
 import io.github.mzattera.v4j.text.ivtff.IvtffPage;
 import io.github.mzattera.v4j.text.ivtff.IvtffText;
+import io.github.mzattera.v4j.text.ivtff.LineFilter;
 import io.github.mzattera.v4j.text.ivtff.VoynichFactory;
 import io.github.mzattera.v4j.text.ivtff.VoynichFactory.Transcription;
 import io.github.mzattera.v4j.text.ivtff.VoynichFactory.TranscriptionType;
@@ -59,6 +60,7 @@ public static void main(String[] args) {
 			System.out.println();
 
 			IvtffText doc = VoynichFactory.getDocument(TRANSCRIPTION, TRANSCRIPTION_TYPE, ALPHABET);
+			doc = doc.filterLines(LineFilter.PARAGRAPH_TEXT_FILTER);
 			if (FILTER != null)
 				doc = doc.filterPages(FILTER);
 
@@ -68,8 +70,8 @@ public static void main(String[] args) {
 				if (w.length == 0)
 					continue; // paranoid guard
 				if ((lastWord != null) && (w[0].equals(lastWord)))
-					System.out.println(l);
-				lastWord = w[w.length-1];
+					System.out.println(l.getPage().getDescriptor().getCluster()+ " -> " + l);
+				lastWord = w[w.length - 1];
 
 			} // for each line