<listitem>
<para>
The <literal>builtin</literal> provider uses built-in operations. Only
- the <literal>C</literal> and <literal>C.UTF-8</literal> locales are
- supported for this provider.
+ the <literal>C</literal>, <literal>C.UTF-8</literal>, and
+ <literal>PG_UNICODE_FAST</literal> locales are supported for this
+ provider.
</para>
<para>
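The <literal>C</literal> locale behavior is identical to the
<literal>C</literal> locale in the libc provider. When using this
locale, the behavior may depend on the database encoding.
</para>
<para>
The <literal>C.UTF-8</literal> locale is available only when the
database encoding is <literal>UTF-8</literal>, and the behavior is
based on Unicode. The collation uses the code point values only. The
regular expression character classes are based on the "POSIX
Compatible" semantics, and the case mapping is the "simple" variant.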
</para>
+ <para>
+ The <literal>PG_UNICODE_FAST</literal> locale is available only when
+ the database encoding is <literal>UTF-8</literal>, and the behavior is
+ based on Unicode. The collation uses the code point values only. The
+ regular expression character classes are based on the "Standard"
+ semantics, and the case mapping is the "full" variant.
+ </para>
</listitem>
</varlistentry>
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>pg_unicode_fast</literal></term>
+ <listitem>
+ <para>
+ This collation sorts by Unicode code point values rather than natural
+ language order. For the functions <function>lower</function>,
+ <function>initcap</function>, and <function>upper</function> it uses
+ Unicode full case mapping. For pattern matching (including regular
+ expressions), it uses the Standard variant of Unicode <ulink
+ url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
+ Properties</ulink>. Behavior is efficient and stable within a
+ <productname>PostgreSQL</productname> major version. It is only
+ available for encoding <literal>UTF8</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>pg_c_utf8</literal></term>
<listitem>
<para>
If <replaceable>provider</replaceable> is <literal>builtin</literal>,
then <replaceable>locale</replaceable> must be specified and set to
- either <literal>C</literal> or <literal>C.UTF-8</literal>.
+ either <literal>C</literal>, <literal>C.UTF-8</literal>, or
+ <literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>
If <xref linkend="create-database-locale-provider"/> is
<literal>builtin</literal>, then <replaceable>locale</replaceable> or
<replaceable>builtin_locale</replaceable> must be specified and set to
- either <literal>C</literal> or <literal>C.UTF-8</literal>.
+ either <literal>C</literal>, <literal>C.UTF-8</literal>, or
+ <literal>PG_UNICODE_FAST</literal>.
</para>
<tip>
<para>
</para>
<para>
The locales available for the <literal>builtin</literal> provider are
- <literal>C</literal> and <literal>C.UTF-8</literal>.
+ <literal>C</literal>, <literal>C.UTF-8</literal>, and
+ <literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>
<para>
If <option>--locale-provider</option> is <literal>builtin</literal>,
<option>--locale</option> or <option>--builtin-locale</option> must be
- specified and set to <literal>C</literal> or
- <literal>C.UTF-8</literal>.
+ specified and set to <literal>C</literal>, <literal>C.UTF-8</literal>,
+ or <literal>PG_UNICODE_FAST</literal>.
</para>
</listitem>
</varlistentry>
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISDIGIT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isdigit(c, true);
+ return pg_u_isdigit(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISALNUM));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isalnum(c, true);
+ return pg_u_isalnum(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
return (c <= (pg_wchar) 127 &&
(pg_char_properties[c] & PG_ISPUNCT));
case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_ispunct(c, true);
+ return pg_u_ispunct(c, !pg_regex_locale->info.builtin.casemap_full);
case PG_REGEX_STRATEGY_LIBC_WIDE:
if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
{
if (strcmp(locale, "C") == 0)
return -1;
- if (strcmp(locale, "C.UTF-8") == 0)
+ else if (strcmp(locale, "C.UTF-8") == 0)
return PG_UTF8;
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ return PG_UTF8;
+
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
canonical_name = "C";
else if (strcmp(locale, "C.UTF-8") == 0 || strcmp(locale, "C.UTF8") == 0)
canonical_name = "C.UTF-8";
+ else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
+ canonical_name = "PG_UNICODE_FAST";
if (!canonical_name)
ereport(ERROR,
strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strlower(dest, destsize, src, srclen, false);
+ return unicode_strlower(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
size_t
.prev_alnum = false,
};
- return unicode_strtitle(dest, destsize, src, srclen, false,
+ return unicode_strtitle(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full,
initcap_wbnext, &wbstate);
}
strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- return unicode_strupper(dest, destsize, src, srclen, false);
+ return unicode_strupper(dest, destsize, src, srclen,
+ locale->info.builtin.casemap_full);
}
pg_locale_t
result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
result->info.builtin.locale = MemoryContextStrdup(context, locstr);
+ result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
result->provider = COLLPROVIDER_BUILTIN;
result->deterministic = true;
result->collate_is_c = true;
return "1";
else if (strcmp(collcollate, "C.UTF-8") == 0)
return "1";
+ else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
+ return "1";
else
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
else if (strcmp(datlocale, "C.UTF-8") == 0 ||
strcmp(datlocale, "C.UTF8") == 0)
canonname = "C.UTF-8";
+ else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
+ canonname = "PG_UNICODE_FAST";
else
pg_fatal("invalid locale name \"%s\" for builtin provider",
datlocale);
if (locale_provider == COLLPROVIDER_BUILTIN)
{
- if (strcmp(datlocale, "C.UTF-8") == 0 && encodingid != PG_UTF8)
+ if ((strcmp(datlocale, "C.UTF-8") == 0 ||
+ strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
+ encodingid != PG_UTF8)
pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
datlocale, "UTF-8");
}
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202501162
+#define CATALOG_VERSION_NO 202501171
#endif
descr => 'sorts by Unicode code point; Unicode and POSIX character semantics',
collname => 'pg_c_utf8', collprovider => 'b', collencoding => '6',
colllocale => 'C.UTF-8', collversion => '1' },
+{ oid => '9535', descr => 'sorts by Unicode code point; Unicode character semantics',
+ collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
+ colllocale => 'PG_UNICODE_FAST', collversion => '1' },
]
struct
{
const char *locale;
+ bool casemap_full;
} builtin;
locale_t lt;
#ifdef USE_ICU
t
(1 row)
+--
+-- Test PG_UNICODE_FAST
+--
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+ERROR: invalid locale name "unicode" for builtin provider
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+ t | lower | initcap | upper | t_bytes | lower_t_bytes | initcap_t_bytes | upper_t_bytes
+-----------------+-----------------+------------------+-------------------+---------+---------------+-----------------+---------------
+ abc DEF 123abc | abc def 123abc | Abc Def 123abc | ABC DEF 123ABC | 14 | 14 | 14 | 14
+ ábc sßs ßss DÉF | ábc sßs ßss déf | Ábc Sßs Ssss Déf | ÁBC SSSS SSSS DÉF | 19 | 19 | 19 | 19
+ DŽxxDŽ džxxDž Džxxdž | džxxdž džxxdž džxxdž | Džxxdž Džxxdž Džxxdž | DŽXXDŽ DŽXXDŽ DŽXXDŽ | 20 | 20 | 20 | 20
+ ȺȺȺ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 6 | 9 | 8 | 6
+ ⱥⱥⱥ | ⱥⱥⱥ | Ⱥⱥⱥ | ȺȺȺ | 9 | 9 | 8 | 6
+ ⱥȺ | ⱥⱥ | Ⱥⱥ | ȺȺ | 5 | 6 | 5 | 4
+(6 rows)
+
+DROP TABLE test_pg_unicode_fast;
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+ lower
+-------
+ ας
+(1 row)
+
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+ lower
+-------
+ ας0
+(1 row)
+
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+ lower
+-------
+ ἀς̓
+(1 row)
+
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+ lower
+-------
+ ᾳςͅ
+(1 row)
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+ lower
+-------
+ σ
+(1 row)
+
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+ lower
+-------
+ 0σ
+(1 row)
+
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+ lower
+-------
+ ασα
+(1 row)
+
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+ lower
+-------
+ ἀσ̓α
+(1 row)
+
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+ lower
+-------
+ ᾳσͅα
+(1 row)
+
+-- properties
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+-- case mapping
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed
+ ?column?
+----------
+ t
+(1 row)
+
SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
+
+--
+-- Test PG_UNICODE_FAST
+--
+
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'unicode'); -- fails
+CREATE COLLATION regress_pg_unicode_fast (
+ provider = builtin, locale = 'PG_UNICODE_FAST');
+
+CREATE TABLE test_pg_unicode_fast (
+ t TEXT COLLATE PG_UNICODE_FAST
+);
+INSERT INTO test_pg_unicode_fast VALUES
+ ('abc DEF 123abc'),
+ ('ábc sßs ßss DÉF'),
+ ('DŽxxDŽ džxxDž Džxxdž'),
+ ('ȺȺȺ'),
+ ('ⱥⱥⱥ'),
+ ('ⱥȺ');
+
+SELECT
+ t, lower(t), initcap(t), upper(t),
+ length(convert_to(t, 'UTF8')) AS t_bytes,
+ length(convert_to(lower(t), 'UTF8')) AS lower_t_bytes,
+ length(convert_to(initcap(t), 'UTF8')) AS initcap_t_bytes,
+ length(convert_to(upper(t), 'UTF8')) AS upper_t_bytes
+ FROM test_pg_unicode_fast;
+
+DROP TABLE test_pg_unicode_fast;
+
+-- test Final_Sigma
+SELECT lower('ΑΣ' COLLATE PG_UNICODE_FAST); -- 0391 03A3
+SELECT lower('ΑΣ0' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0030
+SELECT lower('ἈΣ̓' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343
+SELECT lower('ᾼΣͅ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345
+
+-- test !Final_Sigma
+SELECT lower('Σ' COLLATE PG_UNICODE_FAST); -- 03A3
+SELECT lower('0Σ' COLLATE PG_UNICODE_FAST); -- 0030 03A3
+SELECT lower('ΑΣΑ' COLLATE PG_UNICODE_FAST); -- 0391 03A3 0391
+SELECT lower('ἈΣ̓Α' COLLATE PG_UNICODE_FAST); -- 0391 0343 03A3 0343 0391
+SELECT lower('ᾼΣͅΑ' COLLATE PG_UNICODE_FAST); -- 0391 0345 03A3 0345 0391
+
+-- properties
+
+SELECT 'xyz' ~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT 'xyz' !~ '[[:upper:]]' COLLATE PG_UNICODE_FAST;
+SELECT '@' !~ '[[:alnum:]]' COLLATE PG_UNICODE_FAST;
+SELECT '=' !~ '[[:punct:]]' COLLATE PG_UNICODE_FAST; -- symbols are not punctuation
+SELECT 'a8a' ~ '[[:digit:]]' COLLATE PG_UNICODE_FAST;
+SELECT '൧' ~ '\d' COLLATE PG_UNICODE_FAST;
+
+-- case mapping
+
+SELECT 'xYz' ~* 'XyZ' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' ~* '[W-Y]' COLLATE PG_UNICODE_FAST;
+SELECT 'xAb' !~* '[c-d]' COLLATE PG_UNICODE_FAST;
+SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_UNICODE_FAST;
+SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases reversed