From 236607eeae0a67735096580c4078dac19a3bbb0b Mon Sep 17 00:00:00 2001
From: Curtis Gagliardi <curtis@curtis.io>
Date: Mon, 31 Oct 2016 12:48:31 -0700
Subject: [PATCH] fix lua counting

There were actually 2 bugs here:

1) problem where the multi-line pattern had the single-line
   pattern as a prefix (for lua: `--`, `--[[`.
2) If the the start and end of multi-line comment are different lengths,
   we end up calling is_character_boundary out of bounds, which returns
   false, causing us to skip over valid characters
---
 Cargo.lock         |  6 +++---
 src/lib.rs         | 21 ++++++++++++---------
 tests/count.rs     | 35 +++++++++++++++++++++++++++++++++++
 tests/data/lua.lua | 16 ++++++++++++++++
 4 files changed, 66 insertions(+), 12 deletions(-)
 create mode 100644 tests/data/lua.lua

diff --git a/Cargo.lock b/Cargo.lock
index d611613..d829fef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,7 +2,7 @@
 name = "loc"
 version = "0.3.3"
 dependencies = [
- "clap 2.16.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "clap 2.16.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "errno 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)",
  "itertools 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -35,7 +35,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
 name = "clap"
-version = "2.16.2"
+version = "2.16.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -235,7 +235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66"
 "checksum ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6"
 "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d"
-"checksum clap 2.16.2 (registry+https://github.com/rust-lang/crates.io-index)" = "08aac7b078ec0a58e1d4b43cfb11d47001f8eb7c6f6f2bda4f5eed43c82491f1"
+"checksum clap 2.16.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0ab91429d96eece6d6cf8a737105f0e255fea039fed86e2118ff8d3fd69601dd"
 "checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf"
 "checksum either 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8aa2c82b7e1abd89a8a59fd89c4a51576ea76a894edf5d5b28944dd46edfed8d"
 "checksum errno 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1e2b2decb0484e15560df3210cf0d78654bb0864b2c138977c07e377a1bae0e2"
diff --git a/src/lib.rs b/src/lib.rs
index aa1fe51..3eb0043 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,7 @@ extern crate memmap;
 extern crate memchr;
 
 use std::path::Path;
-use std::cmp;
+use std::cmp::{min, max};
 use std::fmt;
 
 use memmap::{Mmap, Protection};
@@ -40,9 +40,7 @@ pub enum LineConfig<'a> {
         multi_start: &'a str,
         multi_end: &'a str,
     },
-    SingleOnly {
-        single_start: &'a str,
-    },
+    SingleOnly { single_start: &'a str },
     MultiOnly {
         multi_start: &'a str,
         multi_end: &'a str,
@@ -580,6 +578,7 @@ pub fn count_single(filepath: &str, single_start: &str) -> Count {
     c
 }
 
+// TODO(cgag): don't forget to update this when fixing the lua bug
 pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count {
     // this is a duplicate of count_single_multi without the check for single comment.
     // Basically removes one branch.  Probably pointless: benchmark.
@@ -626,7 +625,7 @@ pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count
         let mut found_code = false;
         'outer: while pos < trimmed.len() {
             // TODO(cgag): must be a less stupid way to do this
-            for i in pos..(pos + cmp::max(start_len, end_len) + 1) {
+            for i in pos..pos + min(max(start_len, end_len) + 1, trimmed.len() - pos) {
                 if !trimmed.is_char_boundary(i) {
                     pos += 1;
                     continue 'outer;
@@ -638,7 +637,7 @@ pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count
                 pos += start_len;
                 in_comment = true;
             } else if in_comment && pos + end_len <= trimmed.len() &&
-               &trimmed[pos..(pos + end_len)] == multi_end {
+                      &trimmed[pos..(pos + end_len)] == multi_end {
                 pos += end_len;
                 in_comment = false;
                 // TODO(cgag): should we bother handling whitespace here?
@@ -714,13 +713,17 @@ pub fn count_single_multi(filepath: &str,
         };
         c.lines += 1;
 
+
         let trimmed = line.trim_left();
         if trimmed.is_empty() {
             c.blank += 1;
             continue;
         };
 
-        if !in_comment && trimmed.starts_with(single_start) {
+        // TODO(cgag): Could be more efficient by only doing this third check when
+        // multi_start starts with the same chars as single_start, such as with lua (--, --[, ]]).
+        // Not sure it's necessary
+        if !in_comment && trimmed.starts_with(single_start) && !trimmed.starts_with(multi_start) {
             c.comment += 1;
             continue;
         }
@@ -744,7 +747,7 @@ pub fn count_single_multi(filepath: &str,
             // TODO(cgag): must be a less stupid way to do this.  At the
             // very least don't recalculate max over and over.  LLVM probably
             // optimizes this but it seems dumb to depend on it?
-            for i in pos..(pos + cmp::max(start_len, end_len) + 1) {
+            for i in pos..pos + min(max(start_len, end_len) + 1, trimmed.len() - pos) {
                 if !trimmed.is_char_boundary(i) {
                     pos += 1;
                     continue 'outer;
@@ -756,7 +759,7 @@ pub fn count_single_multi(filepath: &str,
                 pos += start_len;
                 in_comment = true;
             } else if in_comment && pos + end_len <= trimmed_len &&
-               &trimmed[pos..(pos + end_len)] == multi_end {
+                      &trimmed[pos..(pos + end_len)] == multi_end {
                 pos += end_len;
                 in_comment = false;
                 // TODO(cgag): should we bother handling whitespace here?
diff --git a/tests/count.rs b/tests/count.rs
index 3bf4327..cc7443b 100644
--- a/tests/count.rs
+++ b/tests/count.rs
@@ -11,6 +11,7 @@ const PLASMA_EXPECTED: Count = Count {
 };
 
 
+
 #[test]
 fn test_plasma_count() {
     assert_eq!(PLASMA_EXPECTED, count(PLASMA));
@@ -172,3 +173,37 @@ fn test_ipl_blank() {
 fn test_ipl_lines() {
     assert_eq!(IPL_EXPECTED.lines, count(IPL).lines);
 }
+
+// TODO(cgag): find or make a better testing tool? Or add some simple macros?
+const LUA: &'static str = "tests/data/lua.lua";
+const LUA_EXPECTED: Count = Count {
+    code: 7,
+    blank: 1,
+    comment: 8,
+    lines: 7 + 8 + 1,
+};
+
+#[test]
+fn test_lua_count() {
+    assert_eq!(LUA_EXPECTED, count(LUA));
+}
+
+#[test]
+fn test_lua_code() {
+    assert_eq!(LUA_EXPECTED.code, count(LUA).code);
+}
+
+#[test]
+fn test_lua_comment() {
+    assert_eq!(LUA_EXPECTED.comment, count(LUA).comment);
+}
+
+#[test]
+fn test_lua_blank() {
+    assert_eq!(LUA_EXPECTED.blank, count(LUA).blank);
+}
+
+#[test]
+fn test_lua_lines() {
+    assert_eq!(LUA_EXPECTED.lines, count(LUA).lines);
+}
diff --git a/tests/data/lua.lua b/tests/data/lua.lua
new file mode 100644
index 0000000..146e10f
--- /dev/null
+++ b/tests/data/lua.lua
@@ -0,0 +1,16 @@
+--[[ This
+     is
+     a
+     multi-line
+     comment,
+     not
+     code. ]]
+
+-- build table
+statetab = {}
+local w1, w2 = NOWORD, NOWORD
+for w in allwords() do
+  insert(prefix(w1, w2), w)
+  w1 = w2; w2 = w;
+end
+insert(prefix(w1, w2), NOWORD)