continue work on preparser

This commit is contained in:
mStar aka a person 2024-02-21 14:53:59 +01:00
parent 1885305283
commit ade4a32b3e
5 changed files with 287 additions and 104 deletions

56
Cargo.lock generated
View file

@ -13,6 +13,7 @@ name = "capybara-engine"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"num-bigint", "num-bigint",
"thiserror",
"widestring", "widestring",
] ]
@ -46,6 +47,61 @@ dependencies = [
"autocfg", "autocfg",
] ]
[[package]]
name = "proc-macro2"
version = "1.0.78"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
dependencies = [
"proc-macro2",
]
[[package]]
name = "syn"
version = "2.0.48"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]] [[package]]
name = "widestring" name = "widestring"
version = "1.0.2" version = "1.0.2"

View file

@ -7,4 +7,5 @@ edition = "2021"
[dependencies] [dependencies]
num-bigint = "0.4.4" num-bigint = "0.4.4"
thiserror = "1.0.56"
widestring = "1.0.2" widestring = "1.0.2"

View file

@ -1,111 +1,11 @@
use std::fmt::Write; mod clear_spaces;
mod insert_semicolons;
// Prepares the given raw code to be consumed by the tokeniser // Prepares the given raw code to be consumed by the tokeniser
// It achieves this by adding missing semicolons and adjusting whitespace (normalising to one space for everything outside of strings) // It achieves this by adding missing semicolons and adjusting whitespace (normalising to one space for everything outside of strings)
pub fn preparse(raw: &str) -> String { pub fn preparse(raw: &str) -> String {
let result = clear_spaces(raw); let result = clear_spaces::clear_spaces(raw);
let result = insert_semicolons::insert_semicolons(&result);
result result
} }
/// Collapses runs of whitespace in `raw` down to the first character of each run.
///
/// Collapsing stops at the first `"` or `'`: that quote and everything after it
/// is copied through untouched. Braces are copied as-is and are not taken into
/// account when deciding whether the next whitespace character starts a new run.
fn clear_spaces(raw: &str) -> String {
    const WRITE_FAILURE: &str =
        "Failed to write character to out string during whitespace normalisation";

    let mut out = String::new();
    // Seeded with a non-whitespace character so a leading whitespace run keeps one character.
    let mut prev = 'a';
    let mut rest = raw.chars();
    while let Some(ch) = rest.next() {
        match ch {
            '"' | '\'' => {
                // Quoted mode is never left again, so the tail is copied verbatim.
                out.write_char(ch).expect(WRITE_FAILURE);
                out.write_str(rest.as_str()).expect(WRITE_FAILURE);
                break;
            }
            // Braces are emitted without touching `prev`.
            '{' | '}' => out.write_char(ch).expect(WRITE_FAILURE),
            other => {
                let inside_run = prev.is_whitespace() && other.is_whitespace();
                if !inside_run {
                    out.write_char(other).expect(WRITE_FAILURE);
                }
                prev = other;
            }
        }
    }
    out
}
// A run made up of different whitespace characters collapses to its first character
#[test]
fn test_clear_spaces1() {
    let collapsed = clear_spaces(" \n \t");
    assert_eq!(collapsed, " ");
}
// Ordinary text around whitespace must come through unchanged
#[test]
fn test_clear_spaces2() {
    assert_eq!(clear_spaces("foo bar"), "foo bar");
}
// Whitespace inside a double quoted string is left alone
#[test]
fn test_clear_spaces3() {
    let source = "\"foo bar\"";
    assert_eq!(clear_spaces(source), source);
}
// Whitespace inside a single quoted string is left alone
#[test]
fn test_clear_spaces4() {
    let source = "'foo bar'";
    assert_eq!(clear_spaces(source), source);
}
// Braces inside a single quoted string are plain text, so their contents stay quoted
#[test]
fn test_clear_spaces5() {
    let source = "'foo {shouldn't compress space} inside'";
    assert_eq!(clear_spaces(source), source);
}
// A code block nested in a double quoted string should be handled like unquoted code
#[test]
fn test_clear_spaces6() {
    let source = "\"foo {should compress space} inside\"";
    assert_eq!(clear_spaces(source), source);
}

View file

@ -0,0 +1,221 @@
use std::{fmt::Write, str::Chars};
/// The delimiter that opened the string literal currently being scanned.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteMethod {
    /// The string was opened with `'`.
    Single,
    /// The string was opened with `"`.
    Double,
    /// The string was opened with a backtick.
    Backtick,
}
// Normalised empty space in codeblocks, ensuring that all whitespace in codeblocks is one character wide
pub fn clear_spaces(raw: &str) -> String {
clear_in_code(&mut raw.chars())
}
/// Copies code from `raw` while collapsing each run of whitespace to its
/// first character.
///
/// String literals are handed to `clear_in_quotes` and nested `{ ... }`
/// blocks are scanned recursively. The function returns when `raw` is
/// exhausted or right after a `}` has been copied, whichever comes first;
/// the iterator is left positioned just behind the last consumed character.
fn clear_in_code(raw: &mut Chars<'_>) -> String {
    let mut cleared = String::new();
    // Seeded with a non-whitespace character so a leading whitespace run keeps one character.
    let mut last_char = 'a';
    while let Some(c) = raw.next() {
        match c {
            '"' | '\'' | '`' => {
                let method = match c {
                    '"' => QuoteMethod::Double,
                    '\'' => QuoteMethod::Single,
                    _ => QuoteMethod::Backtick,
                };
                cleared.push(c);
                cleared.push_str(&clear_in_quotes(raw, method));
                last_char = c;
            }
            '{' => {
                cleared.push('{');
                cleared.push_str(&clear_in_code(raw));
                // The nested scan normally ends on the closing brace.
                last_char = '}';
            }
            '}' => {
                cleared.push('}');
                return cleared;
            }
            // Still inside a whitespace run: `last_char` is already whitespace, drop `c`.
            _ if c.is_whitespace() && last_char.is_whitespace() => {}
            _ => {
                cleared.push(c);
                last_char = c;
            }
        }
    }
    cleared
}
/// Copies the rest of a string literal from `raw` verbatim, up to and
/// including its closing delimiter, and returns the copied text.
///
/// `method` says which delimiter opened the string; only that delimiter can
/// close it. A backslash escapes the character that follows it, so an escaped
/// delimiter does not end the string while a delimiter preceded by an escaped
/// backslash does. If the input ends before the string is closed, everything
/// read so far is returned.
fn clear_in_quotes(raw: &mut Chars<'_>, method: QuoteMethod) -> String {
    let closing = match method {
        QuoteMethod::Single => '\'',
        QuoteMethod::Double => '"',
        QuoteMethod::Backtick => '`',
    };
    // Code blocks are only run inside backtick strings and with a $ before the curly braces
    let interpolates = closing == '`';

    let mut cleared = String::new();
    // True when the previous character was an unescaped backslash.
    let mut escaped = false;
    // True when the previous character was an unescaped `$`.
    let mut after_dollar = false;
    while let Some(c) = raw.next() {
        cleared.push(c);
        if escaped {
            // An escaped character never closes the string or opens a code block.
            escaped = false;
            after_dollar = false;
            continue;
        }
        match c {
            '\\' => {
                escaped = true;
                after_dollar = false;
            }
            _ if c == closing => return cleared,
            '{' if interpolates && after_dollar => {
                // Whitespace inside the embedded code block is normalised like any other code.
                cleared.push_str(&clear_in_code(raw));
                after_dollar = false;
            }
            _ => after_dollar = c == '$',
        }
    }
    cleared
}
/// Previous whitespace normaliser; no caller is visible in this file.
///
/// Collapses runs of whitespace in `raw` down to the first character of each
/// run. Collapsing stops at the first `"` or `'`: that quote and everything
/// after it is copied through untouched. Braces are copied as-is and are not
/// taken into account when deciding whether the next whitespace character
/// starts a new run.
fn old_clear_spaces(raw: &str) -> String {
    const WRITE_FAILURE: &str =
        "Failed to write character to out string during whitespace normalisation";

    let mut out = String::new();
    // Seeded with a non-whitespace character so a leading whitespace run keeps one character.
    let mut prev = 'a';
    let mut rest = raw.chars();
    while let Some(ch) = rest.next() {
        match ch {
            '"' | '\'' => {
                // Quoted mode is never left again, so the tail is copied verbatim.
                out.write_char(ch).expect(WRITE_FAILURE);
                out.write_str(rest.as_str()).expect(WRITE_FAILURE);
                break;
            }
            // Braces are emitted without touching `prev`.
            '{' | '}' => out.write_char(ch).expect(WRITE_FAILURE),
            other => {
                let inside_run = prev.is_whitespace() && other.is_whitespace();
                if !inside_run {
                    out.write_char(other).expect(WRITE_FAILURE);
                }
                prev = other;
            }
        }
    }
    out
}
// A run made up of different whitespace characters collapses to its first character
#[test]
fn test_clear_spaces1() {
    let collapsed = clear_spaces(" \n \t");
    assert_eq!(collapsed, " ");
}
// Ordinary text around whitespace must come through unchanged
#[test]
fn test_clear_spaces2() {
    assert_eq!(clear_spaces("foo bar"), "foo bar");
}
// Whitespace inside a double quoted string is left alone
#[test]
fn test_clear_spaces3() {
    let source = "\"foo bar\"";
    assert_eq!(clear_spaces(source), source);
}
// Whitespace inside a single quoted string is left alone
#[test]
fn test_clear_spaces4() {
    let source = "'foo bar'";
    assert_eq!(clear_spaces(source), source);
}
// Braces inside a single quoted string are plain text, so their contents stay quoted,
// and an escaped single quote must not end the string
#[test]
fn test_clear_spaces5() {
    let source = "pre quote'foo {shouldn\\'t compress space} inside'post quote";
    assert_eq!(clear_spaces(source), source);
}
// A `${...}` code block inside a backtick string is treated as non-quoted code
#[test]
fn test_clear_spaces6() {
    let source = "pre quote`foo ${should compress space} inside`post quote";
    assert_eq!(clear_spaces(source), source);
}

View file

@ -0,0 +1,5 @@
/// Takes space normalised source code and adds semicolons where needed.
///
/// May not work properly if the string isn't space normalised beforehand.
///
/// Semicolon insertion is not implemented yet. Until it is, the input is
/// handed back unchanged so that callers keep the source they passed in
/// instead of receiving an empty string.
pub fn insert_semicolons(raw: &str) -> String {
    // TODO: insert the missing semicolons.
    raw.to_owned()
}