// fennec/include/fennec/langproc/compile/tokenizer.h

// =====================================================================================================================
//  fennec, a free and open source game engine
//  Copyright © 2025  Medusa Slockbower
//
//  This program is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <https://www.gnu.org/licenses/>.
// =====================================================================================================================

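///
/// \file tokenizer.h
/// \brief Declares the escape_sequence interface and the tokenizer struct.
///
///
/// \details The tokenizer is configured with delimiter, operator, bracket, quote and escape characters.
/// \author Medusa Slockbower
///
/// \copyright Copyright © 2025 Medusa Slockbower ([GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html))
///
///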

#ifndef FENNEC_LANGPROC_FORMAT_TOKENIZER_H
#define FENNEC_LANGPROC_FORMAT_TOKENIZER_H

#include <fennec/containers/list.h>
#include <fennec/langproc/strings/string.h>

//
// Escape sequences are tricky: sometimes they must be separated by white space,
// other times they need not be. Requiring a list of all possible escape sequences is unrealistic.
// We need to allow the user of this struct to specify rules for escape sequences. Here are some basic rules:
//
// - An escape sequence is marked by an escape character (or characters), e.g. '%', '\', '{{'.
// - Multiple escape characters may be used in a single tokenizer, and each may have different rules.
// - Escape characters may also be operators, brackets, or quotes.
// - Escape sequences may contain operators, brackets, or quotes.
//
// Here are a few examples of escape sequences from various formats and languages:
// C: \\, \n, \0, \u200b
// PrintF: %s, %2.2f
// Python FMT: {{, }}
// SPSS: ''
//

namespace fennec
{

///
/// \brief Interface for user-supplied escape sequence rules.
///
/// \details Implementations are referenced through `escape_sequence*`
/// (see tokenizer::escseq / tokenizer::escmap), so the type is a polymorphic base.
///
struct escape_sequence {
	/// Virtual destructor so that implementations can be safely destroyed through an `escape_sequence*`.
	virtual ~escape_sequence() = default;

	///
	/// \brief Examines \p str at index \p i.
	/// \param str the text being examined
	/// \param i   an index into \p str
	/// \returns a size_t. NOTE(review): presumably the length (or end index) of the escape
	///          sequence found at \p i -- the exact contract is not defined yet, TODO confirm.
	///
	/// NOTE(review): a two-parameter `operator[]` requires C++23 (multidimensional subscript).
	/// NOTE(review): this takes `std::string` while the tokenizer uses the unqualified `string`;
	///               confirm which string type is intended.
	virtual size_t operator[](const std::string& str, size_t i) = 0;
};

///
/// \brief Configurable tokenizer that splits a line of text into typed tokens.
///
/// \details The character sets below are configured by the user; `operator()` consumes them.
/// NOTE(review): tokenization is still unfinished -- `operator()` currently only locates
/// delimiter positions and always returns an empty list.
///
struct tokenizer {
	using escseq = escape_sequence*;   // non-owning handle to an escape sequence rule
	using escmap = map<char, escseq>;  // escape character -> rule used for sequences it starts

	string delimiter; // markers that separate tokens
	string operators; // operators are treated as individual tokens
	string brackets;  // characters that mark brackets
	string quotes;    // characters that mark a string sequence, entire string sequence is treated as one token
	escmap escapes;   // characters that mark the start of an escape sequence and validate them
	bool   numbers = false; // Anything that resembles a number; defaulted so it is never read uninitialized

	/// Categories a token can be tagged with; stored as the second member of `token`.
	enum token_ : uint8_t {
		token_text     = 0,
		token_integer,
		token_string,
		token_newline,
		token_escaped,
		token_operator,
		token_bracket,
		token_quoted,

		num_token_types // count of the public token types above
	};

	/// A token: its text paired with a `token_` value.
	using token = pair<string, uint8_t>;

private:
	// Internal marker for delimiter positions. It takes the first value past the
	// public token types so that it cannot collide with any of them.
	static constexpr uint8_t token_delimiter = num_token_types;

public:
	///
	/// \brief Tokenizes a single line.
	/// \param line the text to tokenize
	/// \returns the list of tokens found in \p line.
	///          NOTE(review): always empty for now, see the TODO below.
	///
	constexpr list<token> operator()(const string& line) {
		list<token> res;
		priority_queue<pair<size_t, uint8_t>> idx; // (position, marker type) of every split point found

		for (size_t i = 0; i < line.size(); ++i) {

			// Record each position that holds a delimiter character. Stop at the
			// first match so a position is recorded once even if `delimiter`
			// contains the same character twice.
			for (char c : delimiter) {
				if (line[i] == c) {
					idx.emplace(i, token_delimiter);
					break;
				}
			}

		}

		// TODO: scan for operators, brackets, quotes, escapes and numbers as well,
		//       then split `line` at the positions collected in `idx` and fill `res`.

		return res;
	}
};

}

#endif // FENNEC_LANGPROC_FORMAT_TOKENIZER_H