continue work on preparser
This commit is contained in:
parent
1885305283
commit
ade4a32b3e
5 changed files with 287 additions and 104 deletions
56
Cargo.lock
generated
56
Cargo.lock
generated
|
@ -13,6 +13,7 @@ name = "capybara-engine"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"num-bigint",
|
"num-bigint",
|
||||||
|
"thiserror",
|
||||||
"widestring",
|
"widestring",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -46,6 +47,61 @@ dependencies = [
|
||||||
"autocfg",
|
"autocfg",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.78"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.35"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "2.0.48"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror"
|
||||||
|
version = "1.0.56"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
|
||||||
|
dependencies = [
|
||||||
|
"thiserror-impl",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thiserror-impl"
|
||||||
|
version = "1.0.56"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "widestring"
|
name = "widestring"
|
||||||
version = "1.0.2"
|
version = "1.0.2"
|
||||||
|
|
|
@ -7,4 +7,5 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
num-bigint = "0.4.4"
|
num-bigint = "0.4.4"
|
||||||
|
thiserror = "1.0.56"
|
||||||
widestring = "1.0.2"
|
widestring = "1.0.2"
|
||||||
|
|
|
@ -1,111 +1,11 @@
|
||||||
use std::fmt::Write;
|
mod clear_spaces;
|
||||||
|
mod insert_semicolons;
|
||||||
|
|
||||||
// Prepares the given raw code to be consumed by the tokeniser
|
// Prepares the given raw code to be consumed by the tokeniser
|
||||||
// It achieves this by adding missing semicolons and adjusting whitespace (normalising to one space for everything outside of strings)
|
// It achieves this by adding missing semicolons and adjusting whitespace (normalising to one space for everything outside of strings)
|
||||||
pub fn preparse(raw: &str) -> String {
|
pub fn preparse(raw: &str) -> String {
|
||||||
let result = clear_spaces(raw);
|
let result = clear_spaces::clear_spaces(raw);
|
||||||
|
let result = insert_semicolons::insert_semicolons(&result);
|
||||||
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
fn clear_spaces(raw: &str) -> String {
|
|
||||||
let mut cleared = String::new();
|
|
||||||
|
|
||||||
let mut currently_quoted = false;
|
|
||||||
let mut quote_is_single = false;
|
|
||||||
let mut inside_quoted_codeblock = false;
|
|
||||||
let mut previous_char = 'a';
|
|
||||||
|
|
||||||
|
|
||||||
for c in raw.chars() {
|
|
||||||
if currently_quoted {
|
|
||||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
previous_char = c;
|
|
||||||
} else {
|
|
||||||
match c {
|
|
||||||
'"' => {
|
|
||||||
if currently_quoted && !quote_is_single && !inside_quoted_codeblock {
|
|
||||||
currently_quoted = false;
|
|
||||||
} else if !currently_quoted {
|
|
||||||
currently_quoted = true;
|
|
||||||
quote_is_single = false;
|
|
||||||
// Just overwrite it here already in case of some weird stuff
|
|
||||||
// Assume that only ever one codeblock will be nested in a string
|
|
||||||
// If there's more, this code will explode
|
|
||||||
// Not really, but spaces might get messed up
|
|
||||||
inside_quoted_codeblock = false;
|
|
||||||
}
|
|
||||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
}
|
|
||||||
'\'' => {
|
|
||||||
if currently_quoted && quote_is_single {
|
|
||||||
currently_quoted = false;
|
|
||||||
} else {
|
|
||||||
currently_quoted = true;
|
|
||||||
quote_is_single = true;
|
|
||||||
}
|
|
||||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
}
|
|
||||||
'{' => {
|
|
||||||
if currently_quoted && !inside_quoted_codeblock && !quote_is_single {
|
|
||||||
inside_quoted_codeblock = true;
|
|
||||||
}
|
|
||||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
}
|
|
||||||
'}' => {
|
|
||||||
if currently_quoted && !inside_quoted_codeblock && !quote_is_single {
|
|
||||||
inside_quoted_codeblock = false;
|
|
||||||
}
|
|
||||||
cleared.write_char(c.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
}
|
|
||||||
x => {
|
|
||||||
if !(previous_char.is_whitespace() && x.is_whitespace()) {
|
|
||||||
cleared.write_char(x.clone()).expect("Failed to write character to out string during whitespace normalisation");
|
|
||||||
} else {
|
|
||||||
}
|
|
||||||
previous_char = x;
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
cleared
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test to make sure that multiple spaces get grouped into one
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces1() {
|
|
||||||
assert_eq!(clear_spaces(" \n \t"), " ".to_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test to make sure that only spaces are affected and not normal text
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces2() {
|
|
||||||
let res = clear_spaces("foo bar");
|
|
||||||
assert_eq!(res, "foo bar".to_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test to make sure that double quoted test keeps spaces
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces3() {
|
|
||||||
assert_eq!(clear_spaces("\"foo bar\""), "\"foo bar\"".to_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Same as previous, but for single quotes
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces4() {
|
|
||||||
assert_eq!(clear_spaces("'foo bar'"), "'foo bar'".to_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test to make sure that code blocks inside single quotes are treated as quoted as well
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces5() {
|
|
||||||
assert_eq!(clear_spaces("'foo {shouldn't compress space} inside'"), "'foo {shouldn't compress space} inside'".to_owned())
|
|
||||||
}
|
|
||||||
|
|
||||||
// Test to make sure that code blocks inside double quotes are treated as non-quoted
|
|
||||||
#[test]
|
|
||||||
fn test_clear_spaces6() {
|
|
||||||
assert_eq!(clear_spaces("\"foo {should compress space} inside\""), "\"foo {should compress space} inside\"".to_owned())
|
|
||||||
}
|
|
221
src/parser/preparser/clear_spaces.rs
Normal file
221
src/parser/preparser/clear_spaces.rs
Normal file
|
@ -0,0 +1,221 @@
|
||||||
|
use std::{fmt::Write, str::Chars};
|
||||||
|
|
||||||
|
/// The delimiter that opened the string literal currently being scanned.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum QuoteMethod {
    Single,
    Double,
    Backtick,
}

impl QuoteMethod {
    /// Maps an opening delimiter character to its quote kind, if it is one.
    fn from_delimiter(c: char) -> Option<Self> {
        match c {
            '\'' => Some(Self::Single),
            '"' => Some(Self::Double),
            '`' => Some(Self::Backtick),
            _ => None,
        }
    }

    /// The character that terminates a literal opened with this delimiter.
    fn delimiter(self) -> char {
        match self {
            Self::Single => '\'',
            Self::Double => '"',
            Self::Backtick => '`',
        }
    }
}

/// Normalises whitespace in code, ensuring that every run of whitespace
/// outside of string literals is exactly one character wide.
///
/// The first character of each whitespace run is the one that is kept (a run
/// starting with `'\n'` stays a `'\n'`). Text inside single-quoted,
/// double-quoted and backtick-quoted literals is copied verbatim, except for
/// `${ ... }` substitutions inside backtick literals, which are treated as code.
pub fn clear_spaces(raw: &str) -> String {
    let mut chars = raw.chars();
    let mut cleared = String::with_capacity(raw.len());

    // `clear_in_code` returns as soon as it sees a `}`. For a stray top-level
    // `}` that would otherwise drop the rest of the input, so keep scanning
    // until every character has been consumed.
    while !chars.as_str().is_empty() {
        cleared.push_str(&clear_in_code(&mut chars));
    }

    cleared
}

/// Scans code (as opposed to string content) from `raw`, collapsing whitespace
/// runs, until the input ends or a closing `}` has been consumed.
///
/// The closing `}` (if any) is included in the returned text.
fn clear_in_code(raw: &mut Chars<'_>) -> String {
    let mut cleared = String::new();
    let mut last_was_whitespace = false;

    // `while let` rather than `for`: the iterator is handed to the recursive
    // calls below, so it cannot stay borrowed by the loop itself.
    while let Some(c) = raw.next() {
        if let Some(method) = QuoteMethod::from_delimiter(c) {
            cleared.push(c);
            cleared.push_str(&clear_in_quotes(raw, method));
            last_was_whitespace = false;
            continue;
        }

        match c {
            '{' => {
                cleared.push('{');
                cleared.push_str(&clear_in_code(raw));
                last_was_whitespace = false;
            }
            '}' => {
                cleared.push('}');
                return cleared;
            }
            other => {
                let is_whitespace = other.is_whitespace();
                if !(is_whitespace && last_was_whitespace) {
                    cleared.push(other);
                }
                last_was_whitespace = is_whitespace;
            }
        }
    }

    cleared
}

/// Copies string content verbatim from `raw` until the unescaped closing
/// delimiter of `method` has been consumed (it is included in the result) or
/// the input ends.
///
/// Inside backtick literals an unescaped `${` starts a code block whose
/// whitespace is normalised like any other code.
fn clear_in_quotes(raw: &mut Chars<'_>, method: QuoteMethod) -> String {
    let closing = method.delimiter();
    let mut cleared = String::new();
    // True when the previous character was an unescaped backslash. Tracking
    // this as state (instead of comparing against the previous character)
    // keeps an escaped backslash such as `\\` from escaping the next quote.
    let mut escaped = false;
    // True when the previous character was an unescaped `$`
    let mut after_dollar = false;

    while let Some(c) = raw.next() {
        cleared.push(c);

        if escaped {
            // An escaped character never closes the literal or opens a block
            escaped = false;
            after_dollar = false;
            continue;
        }

        if c == '\\' {
            escaped = true;
        } else if c == closing {
            return cleared;
        } else if c == '{' && after_dollar && method == QuoteMethod::Backtick {
            // Code blocks are only run inside backtick strings and with a $ before the curly braces
            cleared.push_str(&clear_in_code(raw));
        }

        after_dollar = c == '$';
    }

    cleared
}
|
||||||
|
|
||||||
|
// Legacy single-pass whitespace normaliser.
//
// Collapses every run of whitespace outside of string literals to its first
// character. Single quoted strings are copied verbatim. Double quoted strings
// are copied verbatim too, except for the contents of a `{ ... }` block inside
// them, which are treated as code.
//
// Assumes that only ever one codeblock level will be nested in a string and
// that the block itself contains no quotes; escapes are not understood.
fn old_clear_spaces(raw: &str) -> String {
    let mut cleared = String::with_capacity(raw.len());

    // Delimiter of the string literal we are currently inside, if any
    let mut open_quote: Option<char> = None;
    let mut inside_quoted_codeblock = false;
    let mut previous_was_whitespace = false;

    for c in raw.chars() {
        // String content is copied as-is, unless we are in a codeblock nested
        // in a double quoted string
        let verbatim = match open_quote {
            Some('\'') => true,
            Some(_) => !inside_quoted_codeblock,
            None => false,
        };

        if verbatim {
            cleared.push(c);
            previous_was_whitespace = false;
            if open_quote == Some(c) {
                // Matching delimiter closes the string again
                open_quote = None;
            } else if c == '{' && open_quote == Some('"') {
                inside_quoted_codeblock = true;
            }
            continue;
        }

        match c {
            '"' | '\'' if open_quote.is_none() => {
                open_quote = Some(c);
                // Reset here already in case a previous string was left unbalanced
                inside_quoted_codeblock = false;
            }
            '}' if open_quote.is_some() => inside_quoted_codeblock = false,
            _ => {}
        }

        let is_whitespace = c.is_whitespace();
        if !(is_whitespace && previous_was_whitespace) {
            cleared.push(c);
        }
        previous_was_whitespace = is_whitespace;
    }

    cleared
}
|
||||||
|
|
||||||
|
// Test to make sure that a run of mixed whitespace gets grouped into one character
#[test]
fn test_clear_spaces1() {
    assert_eq!(clear_spaces(" \n \t"), " ");
}

// Test to make sure that only whitespace runs are affected and not normal text
#[test]
fn test_clear_spaces2() {
    let res = clear_spaces("foo   bar");
    assert_eq!(res, "foo bar");
}

// Test to make sure that double quoted text keeps its spaces
#[test]
fn test_clear_spaces3() {
    let res = clear_spaces("\"foo   bar\"");
    assert_eq!(res, "\"foo   bar\"");
}

// Same as previous, but for single quotes
#[test]
fn test_clear_spaces4() {
    let res = clear_spaces("'foo   bar'");
    assert_eq!(res, "'foo   bar'");
}

// Test to make sure that code blocks inside single quotes are treated as quoted as well,
// that an escaped quote does not end the string, and that code around the string is normalised
#[test]
fn test_clear_spaces5() {
    let res = clear_spaces(
        "pre   quote'foo   {shouldn\\'t   compress   space}   inside'post   quote",
    );
    assert_eq!(
        res,
        "pre quote'foo   {shouldn\\'t   compress   space}   inside'post quote"
    );
}

// Test to make sure that `${ ... }` blocks inside backtick strings are treated as non-quoted
// while the rest of the string keeps its spaces
#[test]
fn test_clear_spaces6() {
    let res = clear_spaces(
        "pre   quote`foo   ${should   compress   space}   inside`post   quote",
    );
    assert_eq!(
        res,
        "pre quote`foo   ${should compress space}   inside`post quote"
    );
}
|
5
src/parser/preparser/insert_semicolons.rs
Normal file
5
src/parser/preparser/insert_semicolons.rs
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
// Takes space normalised source code and adds semicolons where needed
// May not work properly if string isn't space normalised beforehand
//
// TODO: semicolon insertion is not implemented yet. Until it is, the input is
// handed back unchanged so that callers such as `preparse` do not end up
// with an empty string and silently lose the whole source.
pub fn insert_semicolons(raw: &str) -> String {
    raw.to_owned()
}
|
Loading…
Reference in a new issue