Skip to content

Commit 1e2219a

Browse files
committed
0.7.35 - wordlist and replacement improvements.
1 parent 70e0b93 commit 1e2219a

13 files changed

+877
-77
lines changed

Cargo.toml

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
[package]
22
name = "rustrict"
33
authors = ["Finn Bear"]
4-
version = "0.7.34"
4+
version = "0.7.35"
55
edition = "2021"
66
license = "MIT OR Apache-2.0"
77
repository = "https://github.com/finnbear/rustrict/"

README.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected
177177

178178
| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
179179
|-------|----------|-------------------|-------------------|------|
180-
| [rustrict](https://crates.io/crates/rustrict) | 80.00% | 93.98% | 76.52% | 9s |
180+
| [rustrict](https://crates.io/crates/rustrict) | 80.00% | 94.01% | 76.50% | 9s |
181181
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |
182182
| [stfu](https://crates.io/crates/stfu) | 91.74% | 77.69% | 95.25% | 45s |
183183
| [profane-rs](https://crates.io/crates/profane-rs) | 80.47% | 73.79% | 82.14% | 52s |

src/censor.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -566,7 +566,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
566566
// space.
567567
// ( and ) are for ignoring appositive phrases.
568568
// Checking node.last is to collapse multiple spaces into one
569-
let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')')
569+
let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')' | '_' | '-')
570570
&& m.node.last != Some(' ');
571571
let new_repetition: bool = !new_space && c == m.last;
572572
let new_skip = !new_space && skippable && !ignore_sep && !new_repetition;

src/character_analyzer.rs

+1-1
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ fn main() {
4747
if let Some(c) = char::from_u32(u) {
4848
let max_width = match c {
4949
'🐿' => 20,
50-
'𒐫' => 40,
50+
'𒐫' => 80,
5151
'𒈙' => 35,
5252
'༺' | '༻' => 25,
5353
_ => {

src/character_widths.bin

0 Bytes
Binary file not shown.

src/dictionary_extra.txt

+4
Original file line number | Diff line number | Diff line change
@@ -87,6 +87,8 @@ faggetaboutit
8787
farming xp
8888
fatty acid
8989
fatty food
90+
femboys are awesome
91+
femboys are cool
9092
few secs
9193
ffa game
9294
fire cracker
@@ -99,6 +101,7 @@ freakin
99101
fuchs dystrophy
100102
fugia
101103
gaya
104+
gg german
102105
ght, its
103106
glhf
104107
graham cracker
@@ -196,6 +199,7 @@ pc master race
196199
pegging the
197200
plss
198201
plsss
202+
plz stop
199203
plzz
200204
plzzz
201205
pocock

src/false_positives.txt

+29
Original file line number | Diff line number | Diff line change
@@ -1049,6 +1049,8 @@ arco on
10491049
arco vary
10501050
arco ward
10511051
arctocephalus
1052+
are africans
1053+
are asians
10521054
areas hole
10531055
ared skins
10541056
arena holes
@@ -3490,6 +3492,7 @@ buzz ext
34903492
buzz hilt
34913493
buzz hit
34923494
buzz lut
3495+
buzz off
34933496
buzz perm
34943497
bytes cumulative
34953498
bytes ext
@@ -3529,6 +3532,7 @@ caliphate
35293532
cam girl
35303533
camel tox
35313534
campoo
3535+
can't it
35323536
canal
35333537
canberra appeal
35343538
canberra appear
@@ -5270,6 +5274,7 @@ directions lut
52705274
directions perm
52715275
directions seeks
52725276
dirty juan
5277+
dirty muslim
52735278
disco jones
52745279
disco on
52755280
disco vary
@@ -6570,6 +6575,8 @@ felt chuck
65706575
felt church
65716576
felt xhtml
65726577
females squirting
6578+
femboys are awesome
6579+
femboys are cool
65736580
fend yourself
65746581
fennig
65756582
fers cumulative
@@ -7203,6 +7210,7 @@ geyan
72037210
geyerite
72047211
geylies
72057212
geyser
7213+
gg ger
72067214
ghastful
72077215
ghettoized
72087216
ghettoizes
@@ -12187,6 +12195,8 @@ nu destin
1218712195
nu destroy
1218812196
nu destruct
1218912197
nu ger
12198+
nuke iran
12199+
nuke israel
1219012200
nurses cumulative
1219112201
nurses ext
1219212202
nurses hilt
@@ -13567,6 +13577,7 @@ plumbaginaceae
1356713577
plumbaginaceous
1356813578
plumbum
1356913579
plumigerous
13580+
plz stop
1357013581
plzz
1357113582
pmsg
1357213583
pn lips
@@ -13581,6 +13592,7 @@ pockets perm
1358113592
pockets seeks
1358213593
pocock
1358313594
pogeys
13595+
poggers
1358413596
pogonips
1358513597
points cumulative
1358613598
points ext
@@ -15285,6 +15297,20 @@ rico ward
1528515297
rid dicke
1528615298
rid licking
1528715299
rid ongoing
15300+
ride mea
15301+
ride mech
15302+
ride med
15303+
ride mee
15304+
ride meg
15305+
ride mel
15306+
ride mem
15307+
ride men
15308+
ride mer
15309+
ride mes
15310+
ride met
15311+
ride mexica
15312+
ride mexico
15313+
ride meyer
1528815314
riders cumulative
1528915315
riders ext
1529015316
riders hilt
@@ -17056,6 +17082,7 @@ spleening
1705617082
spleninii
1705717083
splice
1705817084
splicing
17085+
splix
1705917086
sponsible peer
1706017087
spoorn
1706117088
sporadic
@@ -18770,6 +18797,7 @@ twattle
1877018797
twattling
1877118798
tweenies
1877218799
tweesht
18800+
tweezer
1877318801
twigger
1877418802
twilit
1877518803
twilt
@@ -19447,6 +19475,7 @@ wan kr
1944719475
wan kurt
1944819476
wan kuwait
1944919477
wan ky
19478+
wang ker
1945019479
wantwit
1945119480
wap anti
1945219481
wap peru

src/lib.rs

+3-2
Original file line number | Diff line number | Diff line change
@@ -88,9 +88,10 @@ pub fn is_whitespace(c: char) -> bool {
8888
// https://www.compart.com/en/unicode/U+FFA0
8989
c.is_whitespace()
9090
|| c.is_other()
91+
|| c.is_format()
9192
|| matches!(
9293
c,
93-
'\u{115F}' | '\u{1160}' | '\u{2800}' | '\u{3164}' | '\u{FFA0}'
94+
'\u{115F}' | '\u{1160}' | '\u{2800}' | '\u{3164}' | '\u{FFA0}' | '\u{FFFC}'
9495
)
9596
}
9697

@@ -113,7 +114,7 @@ mod tests {
113114
// Special cases.
114115
assert_eq!(
115116
crate::trim_whitespace(
116-
"\u{0488}\u{1160}\u{0489}\u{1160}\u{0488}\u{1160}\u{0489}abc\u{0488}\u{0489}"
117+
"\u{FFF9}\u{FFFA}\u{FFFB}\u{FFFC}\u{0488}\u{1160}\u{0489}\u{1160}\u{0488}\u{1160}\u{0489}abc\u{0488}\u{0489}"
117118
),
118119
"abc\u{0488}\u{0489}"
119120
)

0 commit comments

Comments
 (0)