Add String/Format commentary

2025-08-18 22:21:21 +02:00 · 2018-05-07 06:22:24 +03:00
parent 56cba4595c
commit e0af41350a
3 changed files with 241 additions and 1 deletions
--- a/src/common/Format.h
+++ b/src/common/Format.h
@@ -5,6 +5,54 @@
 #include "common/String.h"
 /*
 	The following formatting specifiers can be written into a
 	ByteStringBuilder or a StringBuilder:
 	Format::Oct(value)
 	Format::Dec(value)
 	Format::Hex(value)
 		Write the value in the specified base.
 	Format::Uppercase(value)
 	Format::NoUppercase(value)
 		Toggle uppercase characters in hexadecimal and scientific form.
 	Format::ShowPoint(value)
 	Format::NoShowPoint(value)
 		In floats, toggle always displaying the decimal point even if
 		the number is an integer.
 	Format::Fixed(value)
 	Format::Scientific(value)
 		Display the float with a fixed number of digits after the
 		decimal point, or force using the scientific notation.
 	Format::Precision(value, size_t precision)
 		Fix the number of digits of precision used for floats. By
 		default also enables the Fixed mode.
 	Format::Width(value, size_t width)
 		Fix the number of characters used to represent the value. By
 		default also sets the fill to the digit '0'.
 	All of the above can be written into builders with the value argument
 	omitted. In that case the specifiers will affect all future writes to
 	the builder.
 	The following formatting specifiers can be passed to string functions
 	SplitNumber and ToNumber:
 	Format::Oct()
 	Format::Dec()
 	Format::Hex()
 		Read the value in the specified base.
 	Format::SkipWS()
 	Format::NoSkipWS()
 		Toggle ignoring the whitespace when reading a number.
 */
 template<typename T> class FormatProxy
 {
 	T const &value;
--- a/src/common/String.cpp
+++ b/src/common/String.cpp
@@ -15,7 +15,7 @@ ByteString ConversionError::formatError(ByteString::value_type const *at, ByteSt
 	return ss.str();
 }
-
+// The STL-packaged standardized UTF-8 conversion interface
 static std::codecvt_utf8<String::value_type> convert(1);
 std::vector<ByteString> ByteString::PartitionBy(value_type ch, bool includeEmpty) const
@@ -219,6 +219,47 @@ ByteString String::ToUtf8() const
 	}
 }
 /*
 	Due to unknown reasons, the STL basically doesn't support string-number
 	conversions for char32_t strings. Under the hood all stream objects use
 	the so-called locale facets to do number formatting and parsing. As the
 	name implies the facets can depend on the currently chosen locale, but
 	they are also specialized by the type of characters that are used in
 	the strings that are written/read.
 	Of particular interest are the std::num_put<T> and std::num_get<T>
 	facets. In accordance with the standard the two class templates are
 	defined, and then specialized to char and wchar_t types. But the
 	generic class template does not implement all the necessary methods,
 	leaving you with undefined reference errors. Manually providing
 	implementations for such methods is not a portable solution.
 	Therefore we provide our own number reading/writing interface, detached
 	from std::basic_stringstream.
 	We would nevertheless like to avoid writing all the conversion code
 	ourselves and use STL as much as possible. As it turns out std::num_put
 	and std::num_get are too wired into std::ios_base and thus are unusable
 	in separation from an STL stream object.
 	A hacky but simple solution is to create a static thread-local
 	std::wstringstream initialized to the C locale (setting the locale of a
 	temporarily created stream every time might be too expensive). Number
 	serialization then simply uses operator<< and then manually widens the
 	produced wchar_t's into char32_t's. Number parsing is more tricky and
 	narrows only a prefix of the parsed string: it selects only characters
 	that could be a part of a number as in "Stage 2" in
 	[facet.num.get.virtuals], narrows them and uses operator>>. The number
 	of characters consumed is used to take an offset into the original
 	string.
 	A std::stringstream is added in the same way for symmetry with
 	ByteStringStream and follows the same protocol except it doesn't
 	perform any narrowing or widening.
 	The nice thing about the *_wchar functions immediately below is that on
 	platforms where wchar_t has 32 bits these should be a no-op.
 */
 inline String::value_type widen_wchar(wchar_t ch)
 {
 	return std::make_unsigned<wchar_t>::type(ch);
--- a/src/common/String.h
+++ b/src/common/String.h
@@ -6,6 +6,157 @@
 #include <string>
 #include <ios>
 /*
 	There are two "string" classes: ByteString and String. They have nearly
 	identical interfaces, except that one stores 8-bit octets (bytes) and
 	the other stores Unicode codepoints.
 	Both classes inherit from std::basic_string (std::string is an
 	instantiation of that), so all the familiar string interface is there;
 	however, some helper methods have been defined:
 	Substr(size_t start = 0, size_t count = npos)
 		Returns a substring starting at position <start> and counting
 		<count> symbols, or until end of string, whichever is earlier.
 		If count == npos, the entire remainder of the string is
 		included.
 	SubstrFromEnd(size_t rstart = 0, size_t rcount = npos)
 		Behaviourally equal to
 			reverse(reverse(str).Substr(rstart, rcount))
 		but is more efficient. Useful for taking suffixes of given
 		length or dropping a fixed number of symbols from the end.
 	Between(size_t begin, size_t end)
 		Returns a substring starting at <begin> and ending at <end>.
 		If end == npos, length of the string is used. If begin > end,
 		an empty string is returned.
 	Insert(size_t pos, String str)
 		Inserts the characters from <str> at position <pos> shifting
 		the rest of the string to the right.
 	Erase(size_t pos, size_t count)
 		Starting at position <pos> erases <count> characters to the
 		right or until the end of string. The rest of the string is
 		shifted left to fill the gap.
 	EraseBetween(size_t from, size_t to)
 		Starting at position <from> erases until position <to> or end of
 		string, whichever is earlier. The rest of the string is shifted
 		left to fill the gap.
 	BeginsWith(String prefix)
 	EndsWith(String suffix)
 	Contains(String infix)
 	Contains(value_type infix)
 		Self-explanatory.
 	ByteString::FromUtf8(bool ignoreError = true)
 		Decodes UTF-8 byte sequences into Unicode codepoints.
 		If ignoreError is true, then invalid byte sequences are widened
 		as-is. Otherwise, a ConversionError is thrown.
 	ByteString::FromAscii()
 		Interprets byte values as Unicode codepoints.
 	String::ToUtf8()
 		Encodes Unicode codepoints into UTF-8 byte sequences.
 	String::ToAscii()
 		Narrows Unicode codepoints into bytes, possibly truncating
 		them (!).
 	To convert something into a string use ByteStringBuilder and
 	StringBuilder respectively. The two use operator<< much like
 	std::ostringstream. To convert a builder to a string use the Build()
 	method. Alternatively you could use the static ByteString::Build and
 	String::Build methods respectively. String::Build(x, y, z) is roughly
 	equivalent to:
 		StringBuilder tmp;
 		tmp << x << y << z;
 		return tmp.Build();
 	To control formatting of the input/output see "common/Format.h".
 	If you simply want to convert a string to a number you can use the
 	ToNumber<type>(bool ignoreError = false) method. It can optionally take
 	a formatting specifier as an argument:
 		str.ToNumber<unsigned>(Format::Hex(), true)
 	Otherwise to parse a string into components you can use splits. A Split
 	is a temporary object that "remembers" how a string is divided into a
 	"prefix", a "separator", and a "suffix". A split can also "fail", if,
 	for example the separator was not found in the string. A Split has the
 	following methods:
 	Before(bool includeSeparator = false)
 		Returns the "prefix", optionally with the "separator".
 	After(bool includeSeparator = false)
 		Returns the "suffix", optionally with the "separator".
 	A Split can also be converted to bool (used in the condition of an 'if'
 	or a 'while'), in which case it shows whether the split had succeeded
 	or failed. The idiomatic code goes like:
 		if(String::Split split = str.SplitBy(','))
 			// use split.Before() and split.After()
 		else
 			// str does not contain a ','
 	The following methods split a string:
 	SplitBy(String sep, size_t from = 0)
 	SplitBy(value_type sep, size_t from = 0)
 		Split on the first occurrence of the <sep> separator since
 		position <from>. If no such separator is found the split fails
 		and "prefix" contains the whole string starting from <from>.
 	SplitByAny(String chars, size_t from = 0)
 		Split on the first occurrence of any of the characters in
 		<chars>.
 	SplitByNot(String chars, size_t from = 0)
 		Split on the first occurrence of any character that is *not* in
 		<chars>.
 	SplitFromEndBy(String sep, size_t from = npos)
 	SplitFromEndBy(value_type sep, size_t from = npos)
 	SplitFromEndByAny(String chars, size_t from = npos)
 	SplitFromEndByNot(String chars, size_t from = npos)
 		These do the same as the functions above except they try to
 		find the *last* occurrence of the separator. If the split fails
 		it is the "suffix" that contains the whole string, and the
 		"prefix" is empty instead.
 	SplitNumber(number &ref, size_t pos = 0)
 		Attempt to read a number (with the type indicated by the type
 		of the reference) from position <pos>. In case of success store
 		the parsed number in the reference and return a split at the
 		end of the parse. The separator in this case is empty. If the
 		parse fails, the "prefix" is empty.
 	SplitNumber(number &ref, format, size_t pos = 0)
 		Parse the number according to the provided formatting
 		specifier.
 	It is recommended to use ByteString::value_type and String::value_type
 	instead of char and char32_t respectively.
 	The reason we do not use std::wstring is that on Windows, wchar_t is a
 	16-bit type, which forces the usage of a UTF-16 (UCS-2) encoding to
 	store the higher parts of the Unicode. The std::wstring implementation
 	does not care to handle the UTF-16 encoding. Considering characters
 	still occupy a variable amount of space (1 or 2 wchar_t's), finding the
 	"n'th character" or "index of character at offset n" becomes a problem.
 	(Even if that were not a problem a better solution would be to use the
 	more space-efficient UTF-8). Therefore the String class is derived from
 	std::basic_string<char32_t>, where char32_t is a type that is
 	guaranteed to contain at least 32 bits. The drawback is that we
 	basically lose std::stringstream (std::basic_stringstream) support and
 	have to implement our own (See "common/String.cpp").
 */
 class ByteStringBuilder;
 class String;
 class StringBuilder;