From 236607eeae0a67735096580c4078dac19a3bbb0b Mon Sep 17 00:00:00 2001 From: Curtis Gagliardi Date: Mon, 31 Oct 2016 12:48:31 -0700 Subject: [PATCH] fix lua counting There were actually 2 bugs here: 1) problem where the multi-line pattern had the single-line pattern as a prefix (for lua: `--`, `--[[`. 2) If the the start and end of multi-line comment are different lengths, we end up calling is_character_boundary out of bounds, which returns false, causing us to skip over valid characters --- Cargo.lock | 6 +++--- src/lib.rs | 21 ++++++++++++--------- tests/count.rs | 35 +++++++++++++++++++++++++++++++++++ tests/data/lua.lua | 16 ++++++++++++++++ 4 files changed, 66 insertions(+), 12 deletions(-) create mode 100644 tests/data/lua.lua diff --git a/Cargo.lock b/Cargo.lock index d611613..d829fef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,7 +2,7 @@ name = "loc" version = "0.3.3" dependencies = [ - "clap 2.16.2 (registry+https://github.com/rust-lang/crates.io-index)", + "clap 2.16.3 (registry+https://github.com/rust-lang/crates.io-index)", "deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", "errno 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)", "itertools 0.5.5 (registry+https://github.com/rust-lang/crates.io-index)", @@ -35,7 +35,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "clap" -version = "2.16.2" +version = "2.16.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -235,7 +235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" "checksum ansi_term 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "23ac7c30002a5accbf7e8987d0632fa6de155b7c3d39d0067317a391e00a2ef6" "checksum bitflags 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "aad18937a628ec6abcd26d1489012cc0e18c21798210f491af69ded9b881106d" -"checksum clap 2.16.2 (registry+https://github.com/rust-lang/crates.io-index)" = "08aac7b078ec0a58e1d4b43cfb11d47001f8eb7c6f6f2bda4f5eed43c82491f1" +"checksum clap 2.16.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0ab91429d96eece6d6cf8a737105f0e255fea039fed86e2118ff8d3fd69601dd" "checksum deque 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1614659040e711785ed8ea24219140654da1729f3ec8a47a9719d041112fe7bf" "checksum either 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8aa2c82b7e1abd89a8a59fd89c4a51576ea76a894edf5d5b28944dd46edfed8d" "checksum errno 0.1.8 (registry+https://github.com/rust-lang/crates.io-index)" = "1e2b2decb0484e15560df3210cf0d78654bb0864b2c138977c07e377a1bae0e2" diff --git a/src/lib.rs b/src/lib.rs index aa1fe51..3eb0043 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ extern crate memmap; extern crate memchr; use std::path::Path; -use std::cmp; +use std::cmp::{min, max}; use std::fmt; use memmap::{Mmap, Protection}; @@ -40,9 +40,7 @@ pub enum LineConfig<'a> { multi_start: &'a str, multi_end: &'a str, }, - SingleOnly { - single_start: &'a str, - }, + SingleOnly { single_start: &'a str }, MultiOnly { multi_start: &'a str, multi_end: &'a str, @@ -580,6 +578,7 @@ pub fn count_single(filepath: &str, single_start: &str) -> Count { c } +// TODO(cgag): don't forget to update this when fixing the lua bug pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count { // this is a duplicate of count_single_multi without the check for single comment. // Basically removes one branch. Probably pointless: benchmark. @@ -626,7 +625,7 @@ pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count let mut found_code = false; 'outer: while pos < trimmed.len() { // TODO(cgag): must be a less stupid way to do this - for i in pos..(pos + cmp::max(start_len, end_len) + 1) { + for i in pos..pos + min(max(start_len, end_len) + 1, trimmed.len() - pos) { if !trimmed.is_char_boundary(i) { pos += 1; continue 'outer; @@ -638,7 +637,7 @@ pub fn count_multi(filepath: &str, multi_start: &str, multi_end: &str) -> Count pos += start_len; in_comment = true; } else if in_comment && pos + end_len <= trimmed.len() && - &trimmed[pos..(pos + end_len)] == multi_end { + &trimmed[pos..(pos + end_len)] == multi_end { pos += end_len; in_comment = false; // TODO(cgag): should we bother handling whitespace here? @@ -714,13 +713,17 @@ pub fn count_single_multi(filepath: &str, }; c.lines += 1; + let trimmed = line.trim_left(); if trimmed.is_empty() { c.blank += 1; continue; }; - if !in_comment && trimmed.starts_with(single_start) { + // TODO(cgag): Could be more efficient by only doing this third check when + // multi_start starts with the same chars as single_start, such as with lua (--, --[, ]]). + // Not sure it's necessary + if !in_comment && trimmed.starts_with(single_start) && !trimmed.starts_with(multi_start) { c.comment += 1; continue; } @@ -744,7 +747,7 @@ pub fn count_single_multi(filepath: &str, // TODO(cgag): must be a less stupid way to do this. At the // very least don't recalculate max over and over. LLVM probably // optimizes this but it seems dumb to depend on it? - for i in pos..(pos + cmp::max(start_len, end_len) + 1) { + for i in pos..pos + min(max(start_len, end_len) + 1, trimmed.len() - pos) { if !trimmed.is_char_boundary(i) { pos += 1; continue 'outer; @@ -756,7 +759,7 @@ pub fn count_single_multi(filepath: &str, pos += start_len; in_comment = true; } else if in_comment && pos + end_len <= trimmed_len && - &trimmed[pos..(pos + end_len)] == multi_end { + &trimmed[pos..(pos + end_len)] == multi_end { pos += end_len; in_comment = false; // TODO(cgag): should we bother handling whitespace here? diff --git a/tests/count.rs b/tests/count.rs index 3bf4327..cc7443b 100644 --- a/tests/count.rs +++ b/tests/count.rs @@ -11,6 +11,7 @@ const PLASMA_EXPECTED: Count = Count { }; + #[test] fn test_plasma_count() { assert_eq!(PLASMA_EXPECTED, count(PLASMA)); @@ -172,3 +173,37 @@ fn test_ipl_blank() { fn test_ipl_lines() { assert_eq!(IPL_EXPECTED.lines, count(IPL).lines); } + +// TODO(cgag): find or make a better testing tool? Or add some simple macros? +const LUA: &'static str = "tests/data/lua.lua"; +const LUA_EXPECTED: Count = Count { + code: 7, + blank: 1, + comment: 8, + lines: 7 + 8 + 1, +}; + +#[test] +fn test_lua_count() { + assert_eq!(LUA_EXPECTED, count(LUA)); +} + +#[test] +fn test_lua_code() { + assert_eq!(LUA_EXPECTED.code, count(LUA).code); +} + +#[test] +fn test_lua_comment() { + assert_eq!(LUA_EXPECTED.comment, count(LUA).comment); +} + +#[test] +fn test_lua_blank() { + assert_eq!(LUA_EXPECTED.blank, count(LUA).blank); +} + +#[test] +fn test_lua_lines() { + assert_eq!(LUA_EXPECTED.lines, count(LUA).lines); +} diff --git a/tests/data/lua.lua b/tests/data/lua.lua new file mode 100644 index 0000000..146e10f --- /dev/null +++ b/tests/data/lua.lua @@ -0,0 +1,16 @@ +--[[ This + is + a + multi-line + comment, + not + code. ]] + +-- build table +statetab = {} +local w1, w2 = NOWORD, NOWORD +for w in allwords() do + insert(prefix(w1, w2), w) + w1 = w2; w2 = w; +end +insert(prefix(w1, w2), NOWORD)