continue work on preparser
This commit is contained in:
parent
1885305283
commit
ade4a32b3e
5 changed files with 287 additions and 104 deletions
56
Cargo.lock
generated
56
Cargo.lock
generated
|
@ -13,6 +13,7 @@ name = "capybara-engine"
|
|||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"num-bigint",
|
||||
"thiserror",
|
||||
"widestring",
|
||||
]
|
||||
|
||||
|
@ -46,6 +47,61 @@ dependencies = [
|
|||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.78"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.48"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||
|
||||
[[package]]
|
||||
name = "widestring"
|
||||
version = "1.0.2"
|
||||
|
|
|
@ -7,4 +7,5 @@ edition = "2021"
|
|||
|
||||
[dependencies]
|
||||
num-bigint = "0.4.4"
|
||||
thiserror = "1.0.56"
|
||||
widestring = "1.0.2"
|
||||
|
|
|
@ -1,111 +1,11 @@
|
|||
use std::fmt::Write;
|
||||
mod clear_spaces;
|
||||
mod insert_semicolons;
|
||||
|
||||
// Prepares the given raw code to be consumed by the tokeniser
|
||||
// It achieves this by adding missing semicolons and adjusting whitespace (normalising to one space for everything outside of strings)
|
||||
pub fn preparse(raw: &str) -> String {
|
||||
let result = clear_spaces(raw);
|
||||
let result = clear_spaces::clear_spaces(raw);
|
||||
let result = insert_semicolons::insert_semicolons(&result);
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
fn clear_spaces(raw: &str) -> String {
|
||||
let mut cleared = String::new();
|
||||
|
||||
let mut currently_quoted = false;
|
||||
let mut quote_is_single = false;
|
||||
let mut inside_quoted_codeblock = false;
|
||||
let mut previous_char = 'a';
|
||||
|
||||
|
||||
for c in raw.chars() {
|
||||
if currently_quoted {
|
||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
previous_char = c;
|
||||
} else {
|
||||
match c {
|
||||
'"' => {
|
||||
if currently_quoted && !quote_is_single && !inside_quoted_codeblock {
|
||||
currently_quoted = false;
|
||||
} else if !currently_quoted {
|
||||
currently_quoted = true;
|
||||
quote_is_single = false;
|
||||
// Just overwrite it here already in case of some weird stuff
|
||||
// Assume that only ever one codeblock will be nested in a string
|
||||
// If there's more, this code will explode
|
||||
// Not really, but spaces might get messed up
|
||||
inside_quoted_codeblock = false;
|
||||
}
|
||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
}
|
||||
'\'' => {
|
||||
if currently_quoted && quote_is_single {
|
||||
currently_quoted = false;
|
||||
} else {
|
||||
currently_quoted = true;
|
||||
quote_is_single = true;
|
||||
}
|
||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
}
|
||||
'{' => {
|
||||
if currently_quoted && !inside_quoted_codeblock && !quote_is_single {
|
||||
inside_quoted_codeblock = true;
|
||||
}
|
||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
}
|
||||
'}' => {
|
||||
if currently_quoted && !inside_quoted_codeblock && !quote_is_single {
|
||||
inside_quoted_codeblock = false;
|
||||
}
|
||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
}
|
||||
x => {
|
||||
if !(previous_char.is_whitespace() && x.is_whitespace()) {
|
||||
cleared.write_char(x.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
||||
} else {
|
||||
}
|
||||
previous_char = x;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cleared
|
||||
}
|
||||
|
||||
// Test to make sure that multiple spaces get grouped into one
|
||||
#[test]
|
||||
fn test_clear_spaces1() {
|
||||
assert_eq!(clear_spaces(" \n \t"), " ".to_owned())
|
||||
}
|
||||
|
||||
// Test to make sure that only spaces are affected and not normal text
|
||||
#[test]
|
||||
fn test_clear_spaces2() {
|
||||
let res = clear_spaces("foo bar");
|
||||
assert_eq!(res, "foo bar".to_owned())
|
||||
}
|
||||
|
||||
// Test to make sure that double quoted test keeps spaces
|
||||
#[test]
|
||||
fn test_clear_spaces3() {
|
||||
assert_eq!(clear_spaces("\"foo bar\""), "\"foo bar\"".to_owned())
|
||||
}
|
||||
|
||||
// Same as previous, but for single quotes
|
||||
#[test]
|
||||
fn test_clear_spaces4() {
|
||||
assert_eq!(clear_spaces("'foo bar'"), "'foo bar'".to_owned())
|
||||
}
|
||||
|
||||
// Test to make sure that code blocks inside single quotes are treated as quoted as well
|
||||
#[test]
|
||||
fn test_clear_spaces5() {
|
||||
assert_eq!(clear_spaces("'foo {shouldn't compress space} inside'"), "'foo {shouldn't compress space} inside'".to_owned())
|
||||
}
|
||||
|
||||
// Test to make sure that code blocks inside double quotes are treated as non-quoted
|
||||
#[test]
|
||||
fn test_clear_spaces6() {
|
||||
assert_eq!(clear_spaces("\"foo {should compress space} inside\""), "\"foo {should compress space} inside\"".to_owned())
|
||||
}
|
221
src/parser/preparser/clear_spaces.rs
Normal file
221
src/parser/preparser/clear_spaces.rs
Normal file
|
@ -0,0 +1,221 @@
|
|||
use std::{fmt::Write, str::Chars};
|
||||
|
||||
/// The kind of string literal the scanner is currently inside.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteMethod {
    Single,
    Double,
    Backtick,
}

impl QuoteMethod {
    /// Maps an opening delimiter to its quote kind, or `None` if `c` does not
    /// start a string literal.
    fn from_delimiter(c: char) -> Option<Self> {
        match c {
            '\'' => Some(Self::Single),
            '"' => Some(Self::Double),
            '`' => Some(Self::Backtick),
            _ => None,
        }
    }

    /// The character that terminates a literal of this kind.
    fn delimiter(self) -> char {
        match self {
            Self::Single => '\'',
            Self::Double => '"',
            Self::Backtick => '`',
        }
    }
}

/// Normalises whitespace in code, ensuring that every run of whitespace
/// outside of string literals is exactly one character wide.
///
/// The first character of each whitespace run is kept as-is and the rest of
/// the run is dropped. String literals (single, double and backtick quoted)
/// are copied verbatim, except for `${ ... }` blocks inside backtick strings,
/// which are treated as code again.
pub fn clear_spaces(raw: &str) -> String {
    clear_in_code(&mut raw.chars(), false)
}

/// Scans code (as opposed to string content) and collapses whitespace runs.
///
/// `nested` is `true` when this call was entered through an opening `{`; in
/// that case the matching `}` ends the call and is included in the returned
/// text. At the top level an unmatched `}` is copied through and scanning
/// continues, so no input is ever dropped.
fn clear_in_code(raw: &mut Chars<'_>, nested: bool) -> String {
    let mut cleared = String::new();
    let mut last_was_whitespace = false;

    // `while let` rather than `for`: the iterator is handed to the nested
    // scanners inside the loop body.
    while let Some(c) = raw.next() {
        if let Some(method) = QuoteMethod::from_delimiter(c) {
            cleared.push(c);
            cleared.push_str(&clear_in_quotes(raw, method));
            last_was_whitespace = false;
            continue;
        }

        match c {
            '{' => {
                cleared.push(c);
                cleared.push_str(&clear_in_code(raw, true));
                last_was_whitespace = false;
            }
            '}' => {
                cleared.push(c);
                if nested {
                    return cleared;
                }
                last_was_whitespace = false;
            }
            _ => {
                let is_whitespace = c.is_whitespace();
                if !(is_whitespace && last_was_whitespace) {
                    cleared.push(c);
                }
                last_was_whitespace = is_whitespace;
            }
        }
    }

    cleared
}

/// Copies string content verbatim up to and including the closing delimiter
/// of `method`.
///
/// A backslash escapes the character that follows it, so `\"` does not close
/// a double quoted string while `\\"` does. Inside backtick strings an
/// unescaped `${` switches back to code scanning until the matching `}`.
/// If the input ends before the closing delimiter, everything read so far is
/// returned.
fn clear_in_quotes(raw: &mut Chars<'_>, method: QuoteMethod) -> String {
    let mut cleared = String::new();
    // True when the previous character was an unescaped backslash.
    let mut escaped = false;
    // True when the previous character was an unescaped `$`.
    let mut after_dollar = false;

    while let Some(c) = raw.next() {
        cleared.push(c);

        if escaped {
            // This character is consumed by the escape; it can neither close
            // the literal nor start an interpolation.
            escaped = false;
            after_dollar = false;
            continue;
        }

        match c {
            '\\' => {
                escaped = true;
                after_dollar = false;
            }
            _ if c == method.delimiter() => return cleared,
            // Code blocks are only run inside backtick strings and with a `$`
            // directly before the curly brace.
            '{' if method == QuoteMethod::Backtick && after_dollar => {
                cleared.push_str(&clear_in_code(raw, true));
                after_dollar = false;
            }
            _ => after_dollar = c == '$',
        }
    }

    cleared
}
|
||||
|
||||
/// Legacy single-pass whitespace normaliser, superseded by `clear_spaces`.
///
/// Collapses runs of whitespace to their first character until the first
/// `"` or `'` is seen.
///
/// Known limitation: once a quote has been opened, every following character
/// is copied verbatim and the closing quote is never recognised, so nothing
/// after the first quote is normalised. `{` and `}` are copied through and do
/// not reset the "previous character was whitespace" tracking.
fn old_clear_spaces(raw: &str) -> String {
    let mut cleared = String::new();
    let mut currently_quoted = false;
    // Any non-whitespace start value works; it only seeds the run detection.
    let mut previous_char = 'a';

    for c in raw.chars() {
        if currently_quoted {
            cleared.push(c);
            continue;
        }

        match c {
            '"' | '\'' => {
                currently_quoted = true;
                cleared.push(c);
            }
            '{' | '}' => cleared.push(c),
            x => {
                if !(previous_char.is_whitespace() && x.is_whitespace()) {
                    cleared.push(x);
                }
                previous_char = x;
            }
        }
    }

    cleared
}
|
||||
|
||||
// A run of mixed whitespace collapses to its first character
#[test]
fn test_clear_spaces1() {
    let res = clear_spaces(" \n \t");
    assert_eq!(res, " ");
}
|
||||
|
||||
// Only whitespace is affected; ordinary text passes through unchanged
#[test]
fn test_clear_spaces2() {
    assert_eq!(clear_spaces("foo bar"), "foo bar");
}
|
||||
|
||||
// Double quoted text keeps its spaces
#[test]
fn test_clear_spaces3() {
    assert_eq!(clear_spaces("\"foo bar\""), "\"foo bar\"");
}
|
||||
|
||||
// Single quoted text keeps its spaces
#[test]
fn test_clear_spaces4() {
    assert_eq!(clear_spaces("'foo bar'"), "'foo bar'");
}
|
||||
|
||||
// Curly braces inside single quotes are still string content, and an escaped
// quote does not end the string
#[test]
fn test_clear_spaces5() {
    assert_eq!(
        clear_spaces("pre quote'foo {shouldn\\'t compress space} inside'post quote"),
        "pre quote'foo {shouldn\\'t compress space} inside'post quote"
    );
}
|
||||
|
||||
// `${...}` code blocks inside backtick strings are treated as non-quoted code
#[test]
fn test_clear_spaces6() {
    assert_eq!(
        clear_spaces("pre quote`foo ${should compress space} inside`post quote"),
        "pre quote`foo ${should compress space} inside`post quote"
    );
}
|
5
src/parser/preparser/insert_semicolons.rs
Normal file
5
src/parser/preparser/insert_semicolons.rs
Normal file
|
@ -0,0 +1,5 @@
|
|||
/// Takes space normalised source code and adds semicolons where needed.
///
/// May not work properly if the string isn't space normalised beforehand.
///
/// TODO: semicolon insertion is not implemented yet. Until it is, the input
/// is returned unchanged so that callers keep the source text instead of
/// receiving an empty string.
pub fn insert_semicolons(raw: &str) -> String {
    raw.to_owned()
}
|
Loading…
Reference in a new issue