Files
fennec/include/fennec/interpret/tokenizer.h

113 lines
3.4 KiB
C++

// =====================================================================================================================
// fennec, a free and open source game engine
// Copyright © 2025 Medusa Slockbower
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
// =====================================================================================================================
///
/// \file tokenizer.h
/// \brief Declares the configurable tokenizer and the escape_sequence rule interface it uses.
///
///
/// \details
/// \author Medusa Slockbower
///
/// \copyright Copyright © 2025 Medusa Slockbower ([GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html))
///
///
#ifndef FENNEC_INTERPRET_TOKENIZER_H
#define FENNEC_INTERPRET_TOKENIZER_H
#include <fennec/containers/list.h>
#include <fennec/containers/map.h>
#include <fennec/containers/priority_queue.h>
#include <fennec/string/string.h>
//
// Escape sequences are tricky: sometimes they must be separated by whitespace,
// other times they need not be. Requiring a list of all possible escape sequences is unrealistic.
// We need to allow the user of this struct to specify rules for escape sequences. Here are some basic rules:
//
// An escape sequence is marked by an escape character, e.g. %, \, {{
// Multiple escape characters may be used in a single tokenizer and will have different rules
// Escape characters may also be operators, brackets, or quotes
// Escape sequences may contain operators, brackets, or quotes
//
// Here are a few examples of escape sequences from various formats and languages
// C: \\, \n, \0, \u200b
// PrintF: %s, %2.2f
// Python FMT: {{, }}
// SPSS: ''
//
namespace fennec
{
///
/// \brief Interface for a user-supplied escape sequence rule.
///
/// \details Implementations are stored by pointer in tokenizer::escapes, keyed by the character
/// that marks the start of the escape sequence.
///
/// \note The two-parameter operator[] below requires C++23 (multidimensional subscript operator).
struct escape_sequence {
    /// \brief Virtual destructor so an implementation can be safely destroyed through an
    ///        escape_sequence pointer.
    virtual ~escape_sequence() = default;

    ///
    /// \brief Examines \p str at index \p i.
    /// \param str the text being examined
    /// \param i   index into \p str; presumably the position of the escape character - TODO confirm
    /// \returns   a size_t whose meaning is not defined in this header; presumably the length or
    ///            end index of the recognised sequence - TODO confirm
    virtual size_t operator[](const string& str, size_t i) = 0;
};
///
/// \brief Configurable, rule-driven tokenizer (work in progress).
///
/// \details The public fields describe which characters act as delimiters, operators, brackets,
/// quotes and escape markers. Tokenization itself is not implemented yet: operator() currently
/// only scans \p line for the delimiter characters and always returns an empty list.
struct tokenizer {
    using escseq = escape_sequence*;
    using escmap = map<char, escape_sequence*>;

    string delimiter;       // markers that separate tokens
    string operators;       // operators are treated as individual tokens
    string brackets;        // characters that mark brackets
    string quotes;          // characters that mark a string sequence, entire string sequence is treated as one token
    escmap escapes;         // characters that mark the start of an escape sequence and validate them
    bool   numbers = false; // Anything that resembles a number; defaults to off so it is never read uninitialized

    /// \brief Token categories. num_token_types is the count of the categories listed before it.
    enum token_ : uint8_t {
        token_text = 0,
        token_integer,
        token_string,
        token_newline,
        token_escaped,
        token_operator,
        token_bracket,
        token_quoted,
        num_token_types
    };

    /// \brief A token: its text paired with a uint8_t tag (presumably a token_ value - TODO confirm).
    using token = pair<string, uint8_t>;

private:
    /// Internal tag with the first value after the public token categories.
    /// Presumably meant to mark delimiter positions while scanning - TODO confirm; it is unused so far.
    static constexpr uint8_t token_delimiter = num_token_types;

public:
    ///
    /// \brief Tokenizes \p line. Not implemented yet: the result is always empty.
    /// \param line the text to split into tokens
    /// \returns the list of tokens found in \p line (currently always an empty list)
    constexpr list<token> operator()(const string& line) {
        list<token> res;
        priority_queue<pair<size_t, uint8_t>> idx;

        for (char c : delimiter) {
            size_t i = 0;
            while (i < line.size()) {
                const size_t n = line.find(c, i);

                // No further occurrence of c. Comparing against size() covers both an
                // npos-style sentinel and an end-index sentinel, whichever find() uses.
                if (n >= line.size())
                    break;

                // TODO

                // Resume the search just past this match; without this the loop never terminates.
                i = n + 1;
            }
        }

        return res;
    }
};
}
#endif // FENNEC_INTERPRET_TOKENIZER_H