| /* GRegex -- regular expression API wrapper around PCRE. |
| * |
| * Copyright (C) 1999, 2000 Scott Wimer |
| * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
| * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
| * Copyright (C) 2022, Marco Trevisan <marco.trevisan@canonical.com> |
| * |
| * SPDX-License-Identifier: LGPL-2.1-or-later |
| * |
| * This library is free software; you can redistribute it and/or |
| * modify it under the terms of the GNU Lesser General Public |
| * License as published by the Free Software Foundation; either |
| * version 2.1 of the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, |
| * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public License |
| * along with this library; if not, see <http://www.gnu.org/licenses/>. |
| */ |
| |
| #include "config.h" |
| |
| #include <stdint.h> |
| #include <string.h> |
| |
| #define PCRE2_CODE_UNIT_WIDTH 8 |
| #include <pcre2.h> |
| |
| #include "gtypes.h" |
| #include "gregex.h" |
| #include "glibintl.h" |
| #include "glist.h" |
| #include "gmessages.h" |
| #include "gstrfuncs.h" |
| #include "gatomic.h" |
| #include "gtestutils.h" |
| #include "gthread.h" |
| |
| /** |
| * GRegex: |
| * |
| * A `GRegex` is the "compiled" form of a regular expression pattern. |
| * |
| * `GRegex` implements regular expression pattern matching using syntax and |
| * semantics similar to Perl regular expression. See the |
| * [PCRE documentation](man:pcrepattern(3)) for the syntax definition. |
| * |
| * Some functions accept a @start_position argument, setting it differs |
| * from just passing over a shortened string and setting %G_REGEX_MATCH_NOTBOL |
| * in the case of a pattern that begins with any kind of lookbehind assertion. |
| * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" |
| * in the middle of words. ("\B" matches only if the current position in the |
| * subject is not a word boundary.) When applied to the string "Mississipi" |
| * from the fourth byte, namely "issipi", it does not match, because "\B" is |
| * always false at the start of the subject, which is deemed to be a word |
| * boundary. However, if the entire string is passed , but with |
| * @start_position set to 4, it finds the second occurrence of "iss" because |
| * it is able to look behind the starting point to discover that it is |
| * preceded by a letter. |
| * |
| * Note that, unless you set the %G_REGEX_RAW flag, all the strings passed |
| * to these functions must be encoded in UTF-8. The lengths and the positions |
| * inside the strings are in bytes and not in characters, so, for instance, |
| * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a |
| * single character. If you set %G_REGEX_RAW the strings can be non-valid |
| * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two |
| * bytes and two characters long. |
| * |
| * When matching a pattern, "\n" matches only against a "\n" character in |
| * the string, and "\r" matches only a "\r" character. To match any newline |
| * sequence use "\R". This particular group matches either the two-character |
| * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, |
| * U+000A, "\n"), VT vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"), |
| * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line |
| * separator, U+2028), or PS (paragraph separator, U+2029). |
| * |
| * The behaviour of the dot, circumflex, and dollar metacharacters are |
| * affected by newline characters, the default is to recognize any newline |
| * character (the same characters recognized by "\R"). This can be changed |
| * with `G_REGEX_NEWLINE_CR`, `G_REGEX_NEWLINE_LF` and `G_REGEX_NEWLINE_CRLF` |
| * compile options, and with `G_REGEX_MATCH_NEWLINE_ANY`, |
| * `G_REGEX_MATCH_NEWLINE_CR`, `G_REGEX_MATCH_NEWLINE_LF` and |
| * `G_REGEX_MATCH_NEWLINE_CRLF` match options. These settings are also |
| * relevant when compiling a pattern if `G_REGEX_EXTENDED` is set, and an |
| * unescaped "#" outside a character class is encountered. This indicates |
| * a comment that lasts until after the next newline. |
| * |
| * Creating and manipulating the same `GRegex` structure from different |
| * threads is not a problem as `GRegex` does not modify its internal |
| * state between creation and destruction, on the other hand `GMatchInfo` |
| * is not threadsafe. |
| * |
| * The regular expressions low-level functionalities are obtained through |
| * the excellent [PCRE](http://www.pcre.org/) library written by Philip Hazel. |
| * |
| * Since: 2.14 |
| */ |
| |
| #define G_REGEX_PCRE_GENERIC_MASK (PCRE2_ANCHORED | \ |
| PCRE2_NO_UTF_CHECK | \ |
| PCRE2_ENDANCHORED) |
| |
| /* Mask of all the possible values for GRegexCompileFlags. */ |
| #define G_REGEX_COMPILE_MASK (G_REGEX_DEFAULT | \ |
| G_REGEX_CASELESS | \ |
| G_REGEX_MULTILINE | \ |
| G_REGEX_DOTALL | \ |
| G_REGEX_EXTENDED | \ |
| G_REGEX_ANCHORED | \ |
| G_REGEX_DOLLAR_ENDONLY | \ |
| G_REGEX_UNGREEDY | \ |
| G_REGEX_RAW | \ |
| G_REGEX_NO_AUTO_CAPTURE | \ |
| G_REGEX_OPTIMIZE | \ |
| G_REGEX_FIRSTLINE | \ |
| G_REGEX_DUPNAMES | \ |
| G_REGEX_NEWLINE_CR | \ |
| G_REGEX_NEWLINE_LF | \ |
| G_REGEX_NEWLINE_CRLF | \ |
| G_REGEX_NEWLINE_ANYCRLF | \ |
| G_REGEX_BSR_ANYCRLF) |
| |
| #define G_REGEX_PCRE2_COMPILE_MASK (PCRE2_ALLOW_EMPTY_CLASS | \ |
| PCRE2_ALT_BSUX | \ |
| PCRE2_AUTO_CALLOUT | \ |
| PCRE2_CASELESS | \ |
| PCRE2_DOLLAR_ENDONLY | \ |
| PCRE2_DOTALL | \ |
| PCRE2_DUPNAMES | \ |
| PCRE2_EXTENDED | \ |
| PCRE2_FIRSTLINE | \ |
| PCRE2_MATCH_UNSET_BACKREF | \ |
| PCRE2_MULTILINE | \ |
| PCRE2_NEVER_UCP | \ |
| PCRE2_NEVER_UTF | \ |
| PCRE2_NO_AUTO_CAPTURE | \ |
| PCRE2_NO_AUTO_POSSESS | \ |
| PCRE2_NO_DOTSTAR_ANCHOR | \ |
| PCRE2_NO_START_OPTIMIZE | \ |
| PCRE2_UCP | \ |
| PCRE2_UNGREEDY | \ |
| PCRE2_UTF | \ |
| PCRE2_NEVER_BACKSLASH_C | \ |
| PCRE2_ALT_CIRCUMFLEX | \ |
| PCRE2_ALT_VERBNAMES | \ |
| PCRE2_USE_OFFSET_LIMIT | \ |
| PCRE2_EXTENDED_MORE | \ |
| PCRE2_LITERAL | \ |
| PCRE2_MATCH_INVALID_UTF | \ |
| G_REGEX_PCRE_GENERIC_MASK) |
| |
| #define G_REGEX_COMPILE_NONPCRE_MASK (PCRE2_UTF) |
| |
| /* Mask of all the possible values for GRegexMatchFlags. */ |
| #define G_REGEX_MATCH_MASK (G_REGEX_MATCH_DEFAULT | \ |
| G_REGEX_MATCH_ANCHORED | \ |
| G_REGEX_MATCH_NOTBOL | \ |
| G_REGEX_MATCH_NOTEOL | \ |
| G_REGEX_MATCH_NOTEMPTY | \ |
| G_REGEX_MATCH_PARTIAL | \ |
| G_REGEX_MATCH_NEWLINE_CR | \ |
| G_REGEX_MATCH_NEWLINE_LF | \ |
| G_REGEX_MATCH_NEWLINE_CRLF | \ |
| G_REGEX_MATCH_NEWLINE_ANY | \ |
| G_REGEX_MATCH_NEWLINE_ANYCRLF | \ |
| G_REGEX_MATCH_BSR_ANYCRLF | \ |
| G_REGEX_MATCH_BSR_ANY | \ |
| G_REGEX_MATCH_PARTIAL_SOFT | \ |
| G_REGEX_MATCH_PARTIAL_HARD | \ |
| G_REGEX_MATCH_NOTEMPTY_ATSTART) |
| |
| #define G_REGEX_PCRE2_MATCH_MASK (PCRE2_NOTBOL |\ |
| PCRE2_NOTEOL |\ |
| PCRE2_NOTEMPTY |\ |
| PCRE2_NOTEMPTY_ATSTART |\ |
| PCRE2_PARTIAL_SOFT |\ |
| PCRE2_PARTIAL_HARD |\ |
| PCRE2_NO_JIT |\ |
| PCRE2_COPY_MATCHED_SUBJECT |\ |
| G_REGEX_PCRE_GENERIC_MASK) |
| |
| /* TODO: Support PCRE2_NEWLINE_NUL */ |
| #define G_REGEX_NEWLINE_MASK (PCRE2_NEWLINE_CR | \ |
| PCRE2_NEWLINE_LF | \ |
| PCRE2_NEWLINE_CRLF | \ |
| PCRE2_NEWLINE_ANYCRLF) |
| |
| /* Some match options are not supported when using JIT as stated in the |
| * pcre2jit man page under the «UNSUPPORTED OPTIONS AND PATTERN ITEMS» section: |
| * https://www.pcre.org/current/doc/html/pcre2jit.html#SEC5 |
| */ |
| #define G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS (PCRE2_ANCHORED | \ |
| PCRE2_ENDANCHORED) |
| |
| #define G_REGEX_COMPILE_NEWLINE_MASK (G_REGEX_NEWLINE_CR | \ |
| G_REGEX_NEWLINE_LF | \ |
| G_REGEX_NEWLINE_CRLF | \ |
| G_REGEX_NEWLINE_ANYCRLF) |
| |
| #define G_REGEX_MATCH_NEWLINE_MASK (G_REGEX_MATCH_NEWLINE_CR | \ |
| G_REGEX_MATCH_NEWLINE_LF | \ |
| G_REGEX_MATCH_NEWLINE_CRLF | \ |
| G_REGEX_MATCH_NEWLINE_ANY | \ |
| G_REGEX_MATCH_NEWLINE_ANYCRLF) |
| |
| /* if the string is in UTF-8 use g_utf8_ functions, else use |
| * use just +/- 1. */ |
| #define NEXT_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \ |
| ((s) + 1) : \ |
| g_utf8_next_char (s)) |
| #define PREV_CHAR(re, s) (((re)->compile_opts & G_REGEX_RAW) ? \ |
| ((s) - 1) : \ |
| g_utf8_prev_char (s)) |
| |
| struct _GMatchInfo |
| { |
| gint ref_count; /* the ref count (atomic) */ |
| GRegex *regex; /* the regex */ |
| uint32_t match_opts; /* pcre match options used at match time on the regex */ |
| gint matches; /* number of matching sub patterns, guaranteed to be <= (n_subpatterns + 1) if doing a single match (rather than matching all) */ |
| uint32_t n_subpatterns; /* total number of sub patterns in the regex */ |
| gint pos; /* position in the string where last match left off */ |
| uint32_t n_offsets; /* number of offsets */ |
| gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */ |
| gint *workspace; /* workspace for pcre2_dfa_match() */ |
| PCRE2_SIZE n_workspace; /* number of workspace elements */ |
| const gchar *string; /* string passed to the match function */ |
| gssize string_len; /* length of string, in bytes */ |
| pcre2_match_context *match_context; |
| pcre2_match_data *match_data; |
| pcre2_jit_stack *jit_stack; |
| }; |
| |
| typedef enum |
| { |
| JIT_STATUS_DEFAULT, |
| JIT_STATUS_ENABLED, |
| JIT_STATUS_DISABLED |
| } JITStatus; |
| |
| struct _GRegex |
| { |
| gint ref_count; /* the ref count for the immutable part (atomic) */ |
| gchar *pattern; /* the pattern */ |
| pcre2_code *pcre_re; /* compiled form of the pattern */ |
| uint32_t compile_opts; /* options used at compile time on the pattern, pcre2 values */ |
| GRegexCompileFlags orig_compile_opts; /* options used at compile time on the pattern, gregex values */ |
| uint32_t match_opts; /* pcre2 options used at match time on the regex */ |
| GRegexMatchFlags orig_match_opts; /* options used as default match options, gregex values */ |
| uint32_t jit_options; /* options which were enabled for jit compiler */ |
| JITStatus jit_status; /* indicates the status of jit compiler for this compiled regex */ |
| /* The jit_status here does _not_ correspond to whether we used the JIT in the last invocation, |
| * which may be affected by match_options or a JIT_STACK_LIMIT error, but whether it was ever |
| * enabled for the current regex AND current set of jit_options. |
| * JIT_STATUS_DEFAULT means enablement was never tried, |
| * JIT_STATUS_ENABLED means it was tried and successful (even if we're not currently using it), |
| * and JIT_STATUS_DISABLED means it was tried and failed (so we shouldn't try again). |
| */ |
| }; |
| |
| /* TRUE if ret is an error code, FALSE otherwise. */ |
| #define IS_PCRE2_ERROR(ret) ((ret) < PCRE2_ERROR_NOMATCH && (ret) != PCRE2_ERROR_PARTIAL) |
| |
| typedef struct _InterpolationData InterpolationData; |
| static gboolean interpolation_list_needs_match (GList *list); |
| static gboolean interpolate_replacement (const GMatchInfo *match_info, |
| GString *result, |
| gpointer data); |
| static GList *split_replacement (const gchar *replacement, |
| GError **error); |
| static void free_interpolation_data (InterpolationData *data); |
| |
| static uint32_t |
| get_pcre2_compile_options (GRegexCompileFlags compile_flags) |
| { |
| /* Maps compile flags to pcre2 values */ |
| uint32_t pcre2_flags = 0; |
| |
| if (compile_flags & G_REGEX_CASELESS) |
| pcre2_flags |= PCRE2_CASELESS; |
| if (compile_flags & G_REGEX_MULTILINE) |
| pcre2_flags |= PCRE2_MULTILINE; |
| if (compile_flags & G_REGEX_DOTALL) |
| pcre2_flags |= PCRE2_DOTALL; |
| if (compile_flags & G_REGEX_EXTENDED) |
| pcre2_flags |= PCRE2_EXTENDED; |
| if (compile_flags & G_REGEX_ANCHORED) |
| pcre2_flags |= PCRE2_ANCHORED; |
| if (compile_flags & G_REGEX_DOLLAR_ENDONLY) |
| pcre2_flags |= PCRE2_DOLLAR_ENDONLY; |
| if (compile_flags & G_REGEX_UNGREEDY) |
| pcre2_flags |= PCRE2_UNGREEDY; |
| if (!(compile_flags & G_REGEX_RAW)) |
| pcre2_flags |= PCRE2_UTF; |
| if (compile_flags & G_REGEX_NO_AUTO_CAPTURE) |
| pcre2_flags |= PCRE2_NO_AUTO_CAPTURE; |
| if (compile_flags & G_REGEX_FIRSTLINE) |
| pcre2_flags |= PCRE2_FIRSTLINE; |
| if (compile_flags & G_REGEX_DUPNAMES) |
| pcre2_flags |= PCRE2_DUPNAMES; |
| |
| return pcre2_flags & G_REGEX_PCRE2_COMPILE_MASK; |
| } |
| |
| static uint32_t |
| get_pcre2_match_options (GRegexMatchFlags match_flags, |
| GRegexCompileFlags compile_flags) |
| { |
| /* Maps match flags to pcre2 values */ |
| uint32_t pcre2_flags = 0; |
| |
| if (match_flags & G_REGEX_MATCH_ANCHORED) |
| pcre2_flags |= PCRE2_ANCHORED; |
| if (match_flags & G_REGEX_MATCH_NOTBOL) |
| pcre2_flags |= PCRE2_NOTBOL; |
| if (match_flags & G_REGEX_MATCH_NOTEOL) |
| pcre2_flags |= PCRE2_NOTEOL; |
| if (match_flags & G_REGEX_MATCH_NOTEMPTY) |
| pcre2_flags |= PCRE2_NOTEMPTY; |
| if (match_flags & G_REGEX_MATCH_PARTIAL_SOFT) |
| pcre2_flags |= PCRE2_PARTIAL_SOFT; |
| if (match_flags & G_REGEX_MATCH_PARTIAL_HARD) |
| pcre2_flags |= PCRE2_PARTIAL_HARD; |
| if (match_flags & G_REGEX_MATCH_NOTEMPTY_ATSTART) |
| pcre2_flags |= PCRE2_NOTEMPTY_ATSTART; |
| |
| if (compile_flags & G_REGEX_RAW) |
| pcre2_flags |= PCRE2_NO_UTF_CHECK; |
| |
| return pcre2_flags & G_REGEX_PCRE2_MATCH_MASK; |
| } |
| |
| static GRegexCompileFlags |
| g_regex_compile_flags_from_pcre2 (uint32_t pcre2_flags) |
| { |
| GRegexCompileFlags compile_flags = G_REGEX_DEFAULT; |
| |
| if (pcre2_flags & PCRE2_CASELESS) |
| compile_flags |= G_REGEX_CASELESS; |
| if (pcre2_flags & PCRE2_MULTILINE) |
| compile_flags |= G_REGEX_MULTILINE; |
| if (pcre2_flags & PCRE2_DOTALL) |
| compile_flags |= G_REGEX_DOTALL; |
| if (pcre2_flags & PCRE2_EXTENDED) |
| compile_flags |= G_REGEX_EXTENDED; |
| if (pcre2_flags & PCRE2_ANCHORED) |
| compile_flags |= G_REGEX_ANCHORED; |
| if (pcre2_flags & PCRE2_DOLLAR_ENDONLY) |
| compile_flags |= G_REGEX_DOLLAR_ENDONLY; |
| if (pcre2_flags & PCRE2_UNGREEDY) |
| compile_flags |= G_REGEX_UNGREEDY; |
| if (!(pcre2_flags & PCRE2_UTF)) |
| compile_flags |= G_REGEX_RAW; |
| if (pcre2_flags & PCRE2_NO_AUTO_CAPTURE) |
| compile_flags |= G_REGEX_NO_AUTO_CAPTURE; |
| if (pcre2_flags & PCRE2_FIRSTLINE) |
| compile_flags |= G_REGEX_FIRSTLINE; |
| if (pcre2_flags & PCRE2_DUPNAMES) |
| compile_flags |= G_REGEX_DUPNAMES; |
| |
| return compile_flags & G_REGEX_COMPILE_MASK; |
| } |
| |
| static GRegexMatchFlags |
| g_regex_match_flags_from_pcre2 (uint32_t pcre2_flags) |
| { |
| GRegexMatchFlags match_flags = G_REGEX_MATCH_DEFAULT; |
| |
| if (pcre2_flags & PCRE2_ANCHORED) |
| match_flags |= G_REGEX_MATCH_ANCHORED; |
| if (pcre2_flags & PCRE2_NOTBOL) |
| match_flags |= G_REGEX_MATCH_NOTBOL; |
| if (pcre2_flags & PCRE2_NOTEOL) |
| match_flags |= G_REGEX_MATCH_NOTEOL; |
| if (pcre2_flags & PCRE2_NOTEMPTY) |
| match_flags |= G_REGEX_MATCH_NOTEMPTY; |
| if (pcre2_flags & PCRE2_PARTIAL_SOFT) |
| match_flags |= G_REGEX_MATCH_PARTIAL_SOFT; |
| if (pcre2_flags & PCRE2_PARTIAL_HARD) |
| match_flags |= G_REGEX_MATCH_PARTIAL_HARD; |
| if (pcre2_flags & PCRE2_NOTEMPTY_ATSTART) |
| match_flags |= G_REGEX_MATCH_NOTEMPTY_ATSTART; |
| |
| return (match_flags & G_REGEX_MATCH_MASK); |
| } |
| |
| static uint32_t |
| get_pcre2_newline_compile_options (GRegexCompileFlags compile_flags) |
| { |
| compile_flags &= G_REGEX_COMPILE_NEWLINE_MASK; |
| |
| switch (compile_flags) |
| { |
| case G_REGEX_NEWLINE_CR: |
| return PCRE2_NEWLINE_CR; |
| case G_REGEX_NEWLINE_LF: |
| return PCRE2_NEWLINE_LF; |
| case G_REGEX_NEWLINE_CRLF: |
| return PCRE2_NEWLINE_CRLF; |
| case G_REGEX_NEWLINE_ANYCRLF: |
| return PCRE2_NEWLINE_ANYCRLF; |
| default: |
| if (compile_flags != 0) |
| return 0; |
| |
| return PCRE2_NEWLINE_ANY; |
| } |
| } |
| |
| static uint32_t |
| get_pcre2_newline_match_options (GRegexMatchFlags match_flags) |
| { |
| switch (match_flags & G_REGEX_MATCH_NEWLINE_MASK) |
| { |
| case G_REGEX_MATCH_NEWLINE_CR: |
| return PCRE2_NEWLINE_CR; |
| case G_REGEX_MATCH_NEWLINE_LF: |
| return PCRE2_NEWLINE_LF; |
| case G_REGEX_MATCH_NEWLINE_CRLF: |
| return PCRE2_NEWLINE_CRLF; |
| case G_REGEX_MATCH_NEWLINE_ANY: |
| return PCRE2_NEWLINE_ANY; |
| case G_REGEX_MATCH_NEWLINE_ANYCRLF: |
| return PCRE2_NEWLINE_ANYCRLF; |
| default: |
| return 0; |
| } |
| } |
| |
| static uint32_t |
| get_pcre2_bsr_compile_options (GRegexCompileFlags compile_flags) |
| { |
| if (compile_flags & G_REGEX_BSR_ANYCRLF) |
| return PCRE2_BSR_ANYCRLF; |
| |
| return PCRE2_BSR_UNICODE; |
| } |
| |
| static uint32_t |
| get_pcre2_bsr_match_options (GRegexMatchFlags match_flags) |
| { |
| if (match_flags & G_REGEX_MATCH_BSR_ANYCRLF) |
| return PCRE2_BSR_ANYCRLF; |
| |
| if (match_flags & G_REGEX_MATCH_BSR_ANY) |
| return PCRE2_BSR_UNICODE; |
| |
| return 0; |
| } |
| |
| static char * |
| get_pcre2_error_string (int errcode) |
| { |
| PCRE2_UCHAR8 error_msg[2048]; |
| int err_length; |
| |
| err_length = pcre2_get_error_message (errcode, error_msg, |
| G_N_ELEMENTS (error_msg)); |
| |
| if (err_length <= 0) |
| return NULL; |
| |
| /* The array is always filled with a trailing zero */ |
| g_assert ((size_t) err_length < G_N_ELEMENTS (error_msg)); |
| return g_memdup2 (error_msg, err_length + 1); |
| } |
| |
| static const gchar * |
| translate_match_error (gint errcode) |
| { |
| switch (errcode) |
| { |
| case PCRE2_ERROR_NOMATCH: |
| /* not an error */ |
| break; |
| case PCRE2_ERROR_NULL: |
| /* NULL argument, this should not happen in GRegex */ |
| g_critical ("A NULL argument was passed to PCRE"); |
| break; |
| case PCRE2_ERROR_BADOPTION: |
| return "bad options"; |
| case PCRE2_ERROR_BADMAGIC: |
| return _("corrupted object"); |
| case PCRE2_ERROR_NOMEMORY: |
| return _("out of memory"); |
| case PCRE2_ERROR_NOSUBSTRING: |
| /* not used by pcre2_match() */ |
| break; |
| case PCRE2_ERROR_MATCHLIMIT: |
| case PCRE2_ERROR_CALLOUT: |
| /* callouts are not implemented */ |
| break; |
| case PCRE2_ERROR_BADUTFOFFSET: |
| /* we do not check if strings are valid */ |
| break; |
| case PCRE2_ERROR_PARTIAL: |
| /* not an error */ |
| break; |
| case PCRE2_ERROR_INTERNAL: |
| return _("internal error"); |
| case PCRE2_ERROR_DFA_UITEM: |
| return _("the pattern contains items not supported for partial matching"); |
| case PCRE2_ERROR_DFA_UCOND: |
| return _("back references as conditions are not supported for partial matching"); |
| case PCRE2_ERROR_DFA_WSSIZE: |
| /* handled expanding the workspace */ |
| break; |
| case PCRE2_ERROR_DFA_RECURSE: |
| case PCRE2_ERROR_RECURSIONLIMIT: |
| return _("recursion limit reached"); |
| case PCRE2_ERROR_BADOFFSET: |
| return _("bad offset"); |
| case PCRE2_ERROR_RECURSELOOP: |
| return _("recursion loop"); |
| case PCRE2_ERROR_JIT_BADOPTION: |
| /* should not happen in GRegex since we check modes before each match */ |
| return _("matching mode is requested that was not compiled for JIT"); |
| default: |
| break; |
| } |
| return NULL; |
| } |
| |
| static char * |
| get_match_error_message (int errcode) |
| { |
| const char *msg = translate_match_error (errcode); |
| char *error_string; |
| |
| if (msg) |
| return g_strdup (msg); |
| |
| error_string = get_pcre2_error_string (errcode); |
| |
| if (error_string) |
| return error_string; |
| |
| return g_strdup (_("unknown error")); |
| } |
| |
| static void |
| translate_compile_error (gint *errcode, const gchar **errmsg) |
| { |
| /* If errcode is known we put the translatable error message in |
| * errmsg. If errcode is unknown we put the generic |
| * G_REGEX_ERROR_COMPILE error code in errcode. |
| * Note that there can be more PCRE errors with the same GRegexError |
| * and that some PCRE errors are useless for us. |
| */ |
| gint original_errcode = *errcode; |
| |
| *errcode = -1; |
| *errmsg = NULL; |
| |
| switch (original_errcode) |
| { |
| case PCRE2_ERROR_END_BACKSLASH: |
| *errcode = G_REGEX_ERROR_STRAY_BACKSLASH; |
| *errmsg = _("\\ at end of pattern"); |
| break; |
| case PCRE2_ERROR_END_BACKSLASH_C: |
| *errcode = G_REGEX_ERROR_MISSING_CONTROL_CHAR; |
| *errmsg = _("\\c at end of pattern"); |
| break; |
| case PCRE2_ERROR_UNKNOWN_ESCAPE: |
| case PCRE2_ERROR_UNSUPPORTED_ESCAPE_SEQUENCE: |
| *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE; |
| *errmsg = _("unrecognized character following \\"); |
| break; |
| case PCRE2_ERROR_QUANTIFIER_OUT_OF_ORDER: |
| *errcode = G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER; |
| *errmsg = _("numbers out of order in {} quantifier"); |
| break; |
| case PCRE2_ERROR_QUANTIFIER_TOO_BIG: |
| *errcode = G_REGEX_ERROR_QUANTIFIER_TOO_BIG; |
| *errmsg = _("number too big in {} quantifier"); |
| break; |
| case PCRE2_ERROR_MISSING_SQUARE_BRACKET: |
| *errcode = G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS; |
| *errmsg = _("missing terminating ] for character class"); |
| break; |
| case PCRE2_ERROR_ESCAPE_INVALID_IN_CLASS: |
| *errcode = G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS; |
| *errmsg = _("invalid escape sequence in character class"); |
| break; |
| case PCRE2_ERROR_CLASS_RANGE_ORDER: |
| *errcode = G_REGEX_ERROR_RANGE_OUT_OF_ORDER; |
| *errmsg = _("range out of order in character class"); |
| break; |
| case PCRE2_ERROR_QUANTIFIER_INVALID: |
| case PCRE2_ERROR_INTERNAL_UNEXPECTED_REPEAT: |
| *errcode = G_REGEX_ERROR_NOTHING_TO_REPEAT; |
| *errmsg = _("nothing to repeat"); |
| break; |
| case PCRE2_ERROR_INVALID_AFTER_PARENS_QUERY: |
| *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
| *errmsg = _("unrecognized character after (? or (?-"); |
| break; |
| case PCRE2_ERROR_POSIX_CLASS_NOT_IN_CLASS: |
| *errcode = G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS; |
| *errmsg = _("POSIX named classes are supported only within a class"); |
| break; |
| case PCRE2_ERROR_POSIX_NO_SUPPORT_COLLATING: |
| *errcode = G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED; |
| *errmsg = _("POSIX collating elements are not supported"); |
| break; |
| case PCRE2_ERROR_MISSING_CLOSING_PARENTHESIS: |
| case PCRE2_ERROR_UNMATCHED_CLOSING_PARENTHESIS: |
| case PCRE2_ERROR_PARENS_QUERY_R_MISSING_CLOSING: |
| *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS; |
| *errmsg = _("missing terminating )"); |
| break; |
| case PCRE2_ERROR_BAD_SUBPATTERN_REFERENCE: |
| *errcode = G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE; |
| *errmsg = _("reference to non-existent subpattern"); |
| break; |
| case PCRE2_ERROR_MISSING_COMMENT_CLOSING: |
| *errcode = G_REGEX_ERROR_UNTERMINATED_COMMENT; |
| *errmsg = _("missing ) after comment"); |
| break; |
| case PCRE2_ERROR_PATTERN_TOO_LARGE: |
| *errcode = G_REGEX_ERROR_EXPRESSION_TOO_LARGE; |
| *errmsg = _("regular expression is too large"); |
| break; |
| case PCRE2_ERROR_MISSING_CONDITION_CLOSING: |
| *errcode = G_REGEX_ERROR_MALFORMED_CONDITION; |
| *errmsg = _("malformed number or name after (?("); |
| break; |
| case PCRE2_ERROR_LOOKBEHIND_NOT_FIXED_LENGTH: |
| *errcode = G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND; |
| *errmsg = _("lookbehind assertion is not fixed length"); |
| break; |
| case PCRE2_ERROR_TOO_MANY_CONDITION_BRANCHES: |
| *errcode = G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES; |
| *errmsg = _("conditional group contains more than two branches"); |
| break; |
| case PCRE2_ERROR_CONDITION_ASSERTION_EXPECTED: |
| *errcode = G_REGEX_ERROR_ASSERTION_EXPECTED; |
| *errmsg = _("assertion expected after (?("); |
| break; |
| case PCRE2_ERROR_BAD_RELATIVE_REFERENCE: |
| *errcode = G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE; |
| *errmsg = _("a numbered reference must not be zero"); |
| break; |
| case PCRE2_ERROR_UNKNOWN_POSIX_CLASS: |
| *errcode = G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME; |
| *errmsg = _("unknown POSIX class name"); |
| break; |
| case PCRE2_ERROR_CODE_POINT_TOO_BIG: |
| case PCRE2_ERROR_INVALID_HEXADECIMAL: |
| *errcode = G_REGEX_ERROR_HEX_CODE_TOO_LARGE; |
| *errmsg = _("character value in \\x{...} sequence is too large"); |
| break; |
| case PCRE2_ERROR_LOOKBEHIND_INVALID_BACKSLASH_C: |
| *errcode = G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND; |
| *errmsg = _("\\C not allowed in lookbehind assertion"); |
| break; |
| case PCRE2_ERROR_MISSING_NAME_TERMINATOR: |
| *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR; |
| *errmsg = _("missing terminator in subpattern name"); |
| break; |
| case PCRE2_ERROR_DUPLICATE_SUBPATTERN_NAME: |
| *errcode = G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME; |
| *errmsg = _("two named subpatterns have the same name"); |
| break; |
| case PCRE2_ERROR_MALFORMED_UNICODE_PROPERTY: |
| *errcode = G_REGEX_ERROR_MALFORMED_PROPERTY; |
| *errmsg = _("malformed \\P or \\p sequence"); |
| break; |
| case PCRE2_ERROR_UNKNOWN_UNICODE_PROPERTY: |
| *errcode = G_REGEX_ERROR_UNKNOWN_PROPERTY; |
| *errmsg = _("unknown property name after \\P or \\p"); |
| break; |
| case PCRE2_ERROR_SUBPATTERN_NAME_TOO_LONG: |
| *errcode = G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG; |
| *errmsg = _("subpattern name is too long (maximum 32 characters)"); |
| break; |
| case PCRE2_ERROR_TOO_MANY_NAMED_SUBPATTERNS: |
| *errcode = G_REGEX_ERROR_TOO_MANY_SUBPATTERNS; |
| *errmsg = _("too many named subpatterns (maximum 10,000)"); |
| break; |
| case PCRE2_ERROR_OCTAL_BYTE_TOO_BIG: |
| *errcode = G_REGEX_ERROR_INVALID_OCTAL_VALUE; |
| *errmsg = _("octal value is greater than \\377"); |
| break; |
| case PCRE2_ERROR_DEFINE_TOO_MANY_BRANCHES: |
| *errcode = G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE; |
| *errmsg = _("DEFINE group contains more than one branch"); |
| break; |
| case PCRE2_ERROR_INTERNAL_UNKNOWN_NEWLINE: |
| *errcode = G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS; |
| *errmsg = _("inconsistent NEWLINE options"); |
| break; |
| case PCRE2_ERROR_BACKSLASH_G_SYNTAX: |
| *errcode = G_REGEX_ERROR_MISSING_BACK_REFERENCE; |
| *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or " |
| "number, or by a plain number"); |
| break; |
| case PCRE2_ERROR_VERB_ARGUMENT_NOT_ALLOWED: |
| *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN; |
| *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)"); |
| break; |
| case PCRE2_ERROR_VERB_UNKNOWN: |
| *errcode = G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB; |
| *errmsg = _("(*VERB) not recognized"); |
| break; |
| case PCRE2_ERROR_SUBPATTERN_NUMBER_TOO_BIG: |
| *errcode = G_REGEX_ERROR_NUMBER_TOO_BIG; |
| *errmsg = _("number is too big"); |
| break; |
| case PCRE2_ERROR_SUBPATTERN_NAME_EXPECTED: |
| *errcode = G_REGEX_ERROR_MISSING_SUBPATTERN_NAME; |
| *errmsg = _("missing subpattern name after (?&"); |
| break; |
| case PCRE2_ERROR_SUBPATTERN_NAMES_MISMATCH: |
| *errcode = G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME; |
| *errmsg = _("different names for subpatterns of the same number are not allowed"); |
| break; |
| case PCRE2_ERROR_MARK_MISSING_ARGUMENT: |
| *errcode = G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED; |
| *errmsg = _("(*MARK) must have an argument"); |
| break; |
| case PCRE2_ERROR_BACKSLASH_C_SYNTAX: |
| *errcode = G_REGEX_ERROR_INVALID_CONTROL_CHAR; |
| *errmsg = _( "\\c must be followed by an ASCII character"); |
| break; |
| case PCRE2_ERROR_BACKSLASH_K_SYNTAX: |
| *errcode = G_REGEX_ERROR_MISSING_NAME; |
| *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name"); |
| break; |
| case PCRE2_ERROR_BACKSLASH_N_IN_CLASS: |
| *errcode = G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS; |
| *errmsg = _("\\N is not supported in a class"); |
| break; |
| case PCRE2_ERROR_VERB_NAME_TOO_LONG: |
| *errcode = G_REGEX_ERROR_NAME_TOO_LONG; |
| *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)"); |
| break; |
| case PCRE2_ERROR_INTERNAL_CODE_OVERFLOW: |
| *errcode = G_REGEX_ERROR_INTERNAL; |
| *errmsg = _("code overflow"); |
| break; |
| case PCRE2_ERROR_UNRECOGNIZED_AFTER_QUERY_P: |
| *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER; |
| *errmsg = _("unrecognized character after (?P"); |
| break; |
| case PCRE2_ERROR_INTERNAL_OVERRAN_WORKSPACE: |
| *errcode = G_REGEX_ERROR_INTERNAL; |
| *errmsg = _("overran compiling workspace"); |
| break; |
| case PCRE2_ERROR_INTERNAL_MISSING_SUBPATTERN: |
| *errcode = G_REGEX_ERROR_INTERNAL; |
| *errmsg = _("previously-checked referenced subpattern not found"); |
| break; |
| case PCRE2_ERROR_HEAP_FAILED: |
| case PCRE2_ERROR_INTERNAL_PARSED_OVERFLOW: |
| case PCRE2_ERROR_UNICODE_NOT_SUPPORTED: |
| case PCRE2_ERROR_UNICODE_DISALLOWED_CODE_POINT: |
| case PCRE2_ERROR_NO_SURROGATES_IN_UTF16: |
| case PCRE2_ERROR_INTERNAL_BAD_CODE_LOOKBEHINDS: |
| case PCRE2_ERROR_UNICODE_PROPERTIES_UNAVAILABLE: |
| case PCRE2_ERROR_INTERNAL_STUDY_ERROR: |
| case PCRE2_ERROR_UTF_IS_DISABLED: |
| case PCRE2_ERROR_UCP_IS_DISABLED: |
| case PCRE2_ERROR_INTERNAL_BAD_CODE_AUTO_POSSESS: |
| case PCRE2_ERROR_BACKSLASH_C_LIBRARY_DISABLED: |
| case PCRE2_ERROR_INTERNAL_BAD_CODE: |
| case PCRE2_ERROR_INTERNAL_BAD_CODE_IN_SKIP: |
| *errcode = G_REGEX_ERROR_INTERNAL; |
| break; |
| case PCRE2_ERROR_INVALID_SUBPATTERN_NAME: |
| case PCRE2_ERROR_CLASS_INVALID_RANGE: |
| case PCRE2_ERROR_ZERO_RELATIVE_REFERENCE: |
| case PCRE2_ERROR_PARENTHESES_STACK_CHECK: |
| case PCRE2_ERROR_LOOKBEHIND_TOO_COMPLICATED: |
| case PCRE2_ERROR_CALLOUT_NUMBER_TOO_BIG: |
| case PCRE2_ERROR_MISSING_CALLOUT_CLOSING: |
| case PCRE2_ERROR_ESCAPE_INVALID_IN_VERB: |
| case PCRE2_ERROR_NULL_PATTERN: |
| case PCRE2_ERROR_BAD_OPTIONS: |
| case PCRE2_ERROR_PARENTHESES_NEST_TOO_DEEP: |
| case PCRE2_ERROR_BACKSLASH_O_MISSING_BRACE: |
| case PCRE2_ERROR_INVALID_OCTAL: |
| case PCRE2_ERROR_CALLOUT_STRING_TOO_LONG: |
| case PCRE2_ERROR_BACKSLASH_U_CODE_POINT_TOO_BIG: |
| case PCRE2_ERROR_MISSING_OCTAL_OR_HEX_DIGITS: |
| case PCRE2_ERROR_VERSION_CONDITION_SYNTAX: |
| case PCRE2_ERROR_CALLOUT_NO_STRING_DELIMITER: |
| case PCRE2_ERROR_CALLOUT_BAD_STRING_DELIMITER: |
| case PCRE2_ERROR_BACKSLASH_C_CALLER_DISABLED: |
| case PCRE2_ERROR_QUERY_BARJX_NEST_TOO_DEEP: |
| case PCRE2_ERROR_PATTERN_TOO_COMPLICATED: |
| case PCRE2_ERROR_LOOKBEHIND_TOO_LONG: |
| case PCRE2_ERROR_PATTERN_STRING_TOO_LONG: |
| case PCRE2_ERROR_BAD_LITERAL_OPTIONS: |
| default: |
| *errcode = G_REGEX_ERROR_COMPILE; |
| break; |
| } |
| |
| g_assert (*errcode != -1); |
| } |
| |
| /* GMatchInfo */ |
| |
| static GMatchInfo * |
| match_info_new (const GRegex *regex, |
| const gchar *string, |
| gint string_len, |
| gint start_position, |
| GRegexMatchFlags match_options, |
| gboolean is_dfa) |
| { |
| GMatchInfo *match_info; |
| |
| if (string_len < 0) |
| string_len = strlen (string); |
| |
| match_info = g_new0 (GMatchInfo, 1); |
| match_info->ref_count = 1; |
| match_info->regex = g_regex_ref ((GRegex *)regex); |
| match_info->string = string; |
| match_info->string_len = string_len; |
| match_info->matches = PCRE2_ERROR_NOMATCH; |
| match_info->pos = start_position; |
| match_info->match_opts = |
| get_pcre2_match_options (match_options, regex->orig_compile_opts); |
| |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, |
| &match_info->n_subpatterns); |
| |
| match_info->match_context = pcre2_match_context_create (NULL); |
| |
| if (is_dfa) |
| { |
| /* These values should be enough for most cases, if they are not |
| * enough g_regex_match_all_full() will expand them. */ |
| match_info->n_workspace = 100; |
| match_info->workspace = g_new (gint, match_info->n_workspace); |
| } |
| |
| match_info->n_offsets = 2; |
| match_info->offsets = g_new0 (gint, match_info->n_offsets); |
| /* Set an invalid position for the previous match. */ |
| match_info->offsets[0] = -1; |
| match_info->offsets[1] = -1; |
| |
| match_info->match_data = pcre2_match_data_create_from_pattern ( |
| match_info->regex->pcre_re, |
| NULL); |
| |
| return match_info; |
| } |
| |
| static gboolean |
| recalc_match_offsets (GMatchInfo *match_info, |
| GError **error) |
| { |
| PCRE2_SIZE *ovector; |
| uint32_t ovector_size = 0; |
| uint32_t pre_n_offset; |
| uint32_t i; |
| |
| g_assert (!IS_PCRE2_ERROR (match_info->matches)); |
| |
| if (match_info->matches == PCRE2_ERROR_PARTIAL) |
| ovector_size = 1; |
| else if (match_info->matches > 0) |
| ovector_size = match_info->matches; |
| |
| g_assert (ovector_size != 0); |
| |
| if (pcre2_get_ovector_count (match_info->match_data) < ovector_size) |
| { |
| g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
| _("Error while matching regular expression %s: %s"), |
| match_info->regex->pattern, _("code overflow")); |
| return FALSE; |
| } |
| |
| pre_n_offset = match_info->n_offsets; |
| match_info->n_offsets = ovector_size * 2; |
| ovector = pcre2_get_ovector_pointer (match_info->match_data); |
| |
| if (match_info->n_offsets != pre_n_offset) |
| { |
| match_info->offsets = g_realloc_n (match_info->offsets, |
| match_info->n_offsets, |
| sizeof (gint)); |
| } |
| |
| for (i = 0; i < match_info->n_offsets; i++) |
| { |
| match_info->offsets[i] = (int) ovector[i]; |
| } |
| |
| return TRUE; |
| } |
| |
| static JITStatus |
| enable_jit_with_match_options (GMatchInfo *match_info, |
| uint32_t match_options) |
| { |
| gint retval; |
| uint32_t old_jit_options, new_jit_options; |
| |
| if (!(match_info->regex->orig_compile_opts & G_REGEX_OPTIMIZE)) |
| return JIT_STATUS_DISABLED; |
| |
| if (match_info->regex->jit_status == JIT_STATUS_DISABLED) |
| return JIT_STATUS_DISABLED; |
| |
| if (match_options & G_REGEX_PCRE2_JIT_UNSUPPORTED_OPTIONS) |
| return JIT_STATUS_DISABLED; |
| |
| old_jit_options = match_info->regex->jit_options; |
| new_jit_options = old_jit_options | PCRE2_JIT_COMPLETE; |
| if (match_options & PCRE2_PARTIAL_HARD) |
| new_jit_options |= PCRE2_JIT_PARTIAL_HARD; |
| if (match_options & PCRE2_PARTIAL_SOFT) |
| new_jit_options |= PCRE2_JIT_PARTIAL_SOFT; |
| |
| /* no new options enabled */ |
| if (new_jit_options == old_jit_options) |
| { |
| g_assert (match_info->regex->jit_status != JIT_STATUS_DEFAULT); |
| return match_info->regex->jit_status; |
| } |
| |
| retval = pcre2_jit_compile (match_info->regex->pcre_re, new_jit_options); |
| if (retval == 0) |
| { |
| match_info->regex->jit_status = JIT_STATUS_ENABLED; |
| |
| match_info->regex->jit_options = new_jit_options; |
| /* Set min stack size for JIT to 32KiB and max to 512KiB */ |
| match_info->jit_stack = pcre2_jit_stack_create (1 << 15, 1 << 19, NULL); |
| pcre2_jit_stack_assign (match_info->match_context, NULL, match_info->jit_stack); |
| } |
| else |
| { |
| match_info->regex->jit_status = JIT_STATUS_DISABLED; |
| |
| switch (retval) |
| { |
| case PCRE2_ERROR_NOMEMORY: |
| g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
| "but JIT was unable to allocate executable memory for the " |
| "compiler. Falling back to interpretive code."); |
| break; |
| case PCRE2_ERROR_JIT_BADOPTION: |
| g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
| "but JIT support is not available. Falling back to " |
| "interpretive code."); |
| break; |
| default: |
| g_debug ("JIT compilation was requested with G_REGEX_OPTIMIZE, " |
| "but request for JIT support had unexpectedly failed (error %d). " |
| "Falling back to interpretive code.", |
| retval); |
| break; |
| } |
| } |
| |
| return match_info->regex->jit_status; |
| |
| g_assert_not_reached (); |
| } |
| |
| /** |
| * g_match_info_get_regex: |
| * @match_info: a #GMatchInfo |
| * |
| * Returns #GRegex object used in @match_info. It belongs to Glib |
| * and must not be freed. Use g_regex_ref() if you need to keep it |
| * after you free @match_info object. |
| * |
| * Returns: (transfer none): #GRegex object used in @match_info |
| * |
| * Since: 2.14 |
| */ |
| GRegex * |
| g_match_info_get_regex (const GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info != NULL, NULL); |
| return match_info->regex; |
| } |
| |
| /** |
| * g_match_info_get_string: |
| * @match_info: a #GMatchInfo |
| * |
| * Returns the string searched with @match_info. This is the |
| * string passed to g_regex_match() or g_regex_replace() so |
| * you may not free it before calling this function. |
| * |
| * Returns: the string searched with @match_info |
| * |
| * Since: 2.14 |
| */ |
| const gchar * |
| g_match_info_get_string (const GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info != NULL, NULL); |
| return match_info->string; |
| } |
| |
| /** |
| * g_match_info_ref: |
| * @match_info: a #GMatchInfo |
| * |
| * Increases reference count of @match_info by 1. |
| * |
| * Returns: @match_info |
| * |
| * Since: 2.30 |
| */ |
| GMatchInfo * |
| g_match_info_ref (GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info != NULL, NULL); |
| g_atomic_int_inc (&match_info->ref_count); |
| return match_info; |
| } |
| |
| /** |
| * g_match_info_unref: |
| * @match_info: a #GMatchInfo |
| * |
| * Decreases reference count of @match_info by 1. When reference count drops |
| * to zero, it frees all the memory associated with the match_info structure. |
| * |
| * Since: 2.30 |
| */ |
| void |
| g_match_info_unref (GMatchInfo *match_info) |
| { |
| if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
| { |
| g_regex_unref (match_info->regex); |
| if (match_info->match_context) |
| pcre2_match_context_free (match_info->match_context); |
| if (match_info->jit_stack) |
| pcre2_jit_stack_free (match_info->jit_stack); |
| if (match_info->match_data) |
| pcre2_match_data_free (match_info->match_data); |
| g_free (match_info->offsets); |
| g_free (match_info->workspace); |
| g_free (match_info); |
| } |
| } |
| |
| /** |
| * g_match_info_free: |
| * @match_info: (nullable): a #GMatchInfo, or %NULL |
| * |
| * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
| * nothing. |
| * |
| * Since: 2.14 |
| */ |
| void |
| g_match_info_free (GMatchInfo *match_info) |
| { |
| if (match_info == NULL) |
| return; |
| |
| g_match_info_unref (match_info); |
| } |
| |
| /** |
| * g_match_info_next: |
| * @match_info: a #GMatchInfo structure |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Scans for the next match using the same parameters of the previous |
| * call to g_regex_match_full() or g_regex_match() that returned |
| * @match_info. |
| * |
| * The match is done on the string passed to the match function, so you |
| * cannot free it before calling this function. |
| * |
| * Returns: %TRUE is the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_match_info_next (GMatchInfo *match_info, |
| GError **error) |
| { |
| JITStatus jit_status; |
| gint prev_match_start; |
| gint prev_match_end; |
| uint32_t opts; |
| |
| g_return_val_if_fail (match_info != NULL, FALSE); |
| g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
| g_return_val_if_fail (match_info->pos >= 0, FALSE); |
| |
| prev_match_start = match_info->offsets[0]; |
| prev_match_end = match_info->offsets[1]; |
| |
| if (match_info->pos > match_info->string_len) |
| { |
| /* we have reached the end of the string */ |
| match_info->pos = -1; |
| match_info->matches = PCRE2_ERROR_NOMATCH; |
| return FALSE; |
| } |
| |
| opts = match_info->regex->match_opts | match_info->match_opts; |
| |
| jit_status = enable_jit_with_match_options (match_info, opts); |
| if (jit_status == JIT_STATUS_ENABLED) |
| { |
| match_info->matches = pcre2_jit_match (match_info->regex->pcre_re, |
| (PCRE2_SPTR8) match_info->string, |
| match_info->string_len, |
| match_info->pos, |
| opts, |
| match_info->match_data, |
| match_info->match_context); |
| /* if the JIT stack limit was reached, fall back to non-JIT matching in |
| * the next conditional statement */ |
| if (match_info->matches == PCRE2_ERROR_JIT_STACKLIMIT) |
| { |
| g_debug ("PCRE2 JIT stack limit reached, falling back to " |
| "non-optimized matching."); |
| opts |= PCRE2_NO_JIT; |
| jit_status = JIT_STATUS_DISABLED; |
| } |
| } |
| |
| if (jit_status != JIT_STATUS_ENABLED) |
| { |
| match_info->matches = pcre2_match (match_info->regex->pcre_re, |
| (PCRE2_SPTR8) match_info->string, |
| match_info->string_len, |
| match_info->pos, |
| opts, |
| match_info->match_data, |
| match_info->match_context); |
| } |
| |
| if (IS_PCRE2_ERROR (match_info->matches)) |
| { |
| gchar *error_msg = get_match_error_message (match_info->matches); |
| |
| g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
| _("Error while matching regular expression %s: %s"), |
| match_info->regex->pattern, error_msg); |
| g_clear_pointer (&error_msg, g_free); |
| return FALSE; |
| } |
| else if (match_info->matches == 0) |
| { |
| /* info->offsets is too small. */ |
| match_info->n_offsets *= 2; |
| match_info->offsets = g_realloc_n (match_info->offsets, |
| match_info->n_offsets, |
| sizeof (gint)); |
| |
| pcre2_match_data_free (match_info->match_data); |
| match_info->match_data = pcre2_match_data_create (match_info->n_offsets, NULL); |
| |
| return g_match_info_next (match_info, error); |
| } |
| else if (match_info->matches == PCRE2_ERROR_NOMATCH) |
| { |
| /* We're done with this match info */ |
| match_info->pos = -1; |
| return FALSE; |
| } |
| else |
| if (!recalc_match_offsets (match_info, error)) |
| return FALSE; |
| |
| /* avoid infinite loops if the pattern is an empty string or something |
| * equivalent */ |
| if (match_info->pos == match_info->offsets[1]) |
| { |
| if (match_info->pos > match_info->string_len) |
| { |
| /* we have reached the end of the string */ |
| match_info->pos = -1; |
| match_info->matches = PCRE2_ERROR_NOMATCH; |
| return FALSE; |
| } |
| |
| match_info->pos = NEXT_CHAR (match_info->regex, |
| &match_info->string[match_info->pos]) - |
| match_info->string; |
| } |
| else |
| { |
| match_info->pos = match_info->offsets[1]; |
| } |
| |
| g_assert (match_info->matches < 0 || |
| (uint32_t) match_info->matches <= match_info->n_subpatterns + 1); |
| |
| /* it's possible to get two identical matches when we are matching |
| * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and |
| * the string is "RegExTest" we have: |
| * - search at position 0: match from 0 to 0 |
| * - search at position 1: match from 3 to 3 |
| * - search at position 3: match from 3 to 3 (duplicate) |
| * - search at position 4: match from 5 to 5 |
| * - search at position 5: match from 5 to 5 (duplicate) |
| * - search at position 6: no match -> stop |
| * so we have to ignore the duplicates. |
| * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ |
| if (match_info->matches >= 0 && |
| prev_match_start == match_info->offsets[0] && |
| prev_match_end == match_info->offsets[1]) |
| { |
| /* ignore this match and search the next one */ |
| return g_match_info_next (match_info, error); |
| } |
| |
| return match_info->matches >= 0; |
| } |
| |
| /** |
| * g_match_info_matches: |
| * @match_info: a #GMatchInfo structure |
| * |
| * Returns whether the previous match operation succeeded. |
| * |
| * Returns: %TRUE if the previous match operation succeeded, |
| * %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_match_info_matches (const GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info != NULL, FALSE); |
| |
| return match_info->matches >= 0; |
| } |
| |
| /** |
| * g_match_info_get_match_count: |
| * @match_info: a #GMatchInfo structure |
| * |
| * Retrieves the number of matched substrings (including substring 0, |
| * that is the whole matched text), so 1 is returned if the pattern |
| * has no substrings in it and 0 is returned if the match failed. |
| * |
| * If the last match was obtained using the DFA algorithm, that is |
| * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
| * count is not that of the number of capturing parentheses but that of |
| * the number of matched substrings. |
| * |
| * Returns: Number of matched substrings, or -1 if an error occurred |
| * |
| * Since: 2.14 |
| */ |
| gint |
| g_match_info_get_match_count (const GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info, -1); |
| |
| if (match_info->matches == PCRE2_ERROR_NOMATCH) |
| /* no match */ |
| return 0; |
| else if (match_info->matches < PCRE2_ERROR_NOMATCH) |
| /* error */ |
| return -1; |
| else |
| /* match */ |
| return match_info->matches; |
| } |
| |
| /** |
| * g_match_info_is_partial_match: |
| * @match_info: a #GMatchInfo structure |
| * |
| * Usually if the string passed to g_regex_match*() matches as far as |
| * it goes, but is too short to match the entire pattern, %FALSE is |
| * returned. There are circumstances where it might be helpful to |
| * distinguish this case from other cases in which there is no match. |
| * |
| * Consider, for example, an application where a human is required to |
| * type in data for a field with specific formatting requirements. An |
| * example might be a date in the form ddmmmyy, defined by the pattern |
| * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
| * If the application sees the user’s keystrokes one by one, and can |
| * check that what has been typed so far is potentially valid, it is |
| * able to raise an error as soon as a mistake is made. |
| * |
| * GRegex supports the concept of partial matching by means of the |
| * %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD flags. |
| * When they are used, the return code for |
| * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
| * for a complete match, %FALSE otherwise. But, when these functions |
| * return %FALSE, you can check if the match was partial calling |
| * g_match_info_is_partial_match(). |
| * |
| * The difference between %G_REGEX_MATCH_PARTIAL_SOFT and |
| * %G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
| * with %G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
| * possible complete match, while with %G_REGEX_MATCH_PARTIAL_HARD matching |
| * stops at the partial match. |
| * When both %G_REGEX_MATCH_PARTIAL_SOFT and %G_REGEX_MATCH_PARTIAL_HARD |
| * are set, the latter takes precedence. |
| * |
| * There were formerly some restrictions on the pattern for partial matching. |
| * The restrictions no longer apply. |
| * |
| * See pcrepartial(3) for more information on partial matching. |
| * |
| * Returns: %TRUE if the match was partial, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_match_info_is_partial_match (const GMatchInfo *match_info) |
| { |
| g_return_val_if_fail (match_info != NULL, FALSE); |
| |
| return match_info->matches == PCRE2_ERROR_PARTIAL; |
| } |
| |
| /** |
| * g_match_info_expand_references: |
| * @match_info: (nullable): a #GMatchInfo or %NULL |
| * @string_to_expand: the string to expand |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Returns a new string containing the text in @string_to_expand with |
| * references and escape sequences expanded. References refer to the last |
| * match done with @string against @regex and have the same syntax used by |
| * g_regex_replace(). |
| * |
| * The @string_to_expand must be UTF-8 encoded even if %G_REGEX_RAW was |
| * passed to g_regex_new(). |
| * |
| * The backreferences are extracted from the string passed to the match |
| * function, so you cannot call this function after freeing the string. |
| * |
| * @match_info may be %NULL in which case @string_to_expand must not |
| * contain references. For instance "foo\n" does not refer to an actual |
| * pattern and '\n' merely will be replaced with \n character, |
| * while to expand "\0" (whole match) one needs the result of a match. |
| * Use g_regex_check_replacement() to find out whether @string_to_expand |
| * contains references. |
| * |
| * Returns: (nullable): the expanded string, or %NULL if an error occurred |
| * |
| * Since: 2.14 |
| */ |
| gchar * |
| g_match_info_expand_references (const GMatchInfo *match_info, |
| const gchar *string_to_expand, |
| GError **error) |
| { |
| GString *result; |
| GList *list; |
| GError *tmp_error = NULL; |
| |
| g_return_val_if_fail (string_to_expand != NULL, NULL); |
| g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
| |
| list = split_replacement (string_to_expand, &tmp_error); |
| if (tmp_error != NULL) |
| { |
| g_propagate_error (error, tmp_error); |
| return NULL; |
| } |
| |
| if (!match_info && interpolation_list_needs_match (list)) |
| { |
| g_critical ("String '%s' contains references to the match, can't " |
| "expand references without GMatchInfo object", |
| string_to_expand); |
| return NULL; |
| } |
| |
| result = g_string_sized_new (strlen (string_to_expand)); |
| interpolate_replacement (match_info, result, list); |
| |
| g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
| |
| return g_string_free (result, FALSE); |
| } |
| |
| /** |
| * g_match_info_fetch: |
| * @match_info: #GMatchInfo structure |
| * @match_num: number of the sub expression |
| * |
| * Retrieves the text matching the @match_num'th capturing |
| * parentheses. 0 is the full text of the match, 1 is the first paren |
| * set, 2 the second, and so on. |
| * |
| * If @match_num is a valid sub pattern but it didn't match anything |
| * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
| * string is returned. |
| * |
| * If the match was obtained using the DFA algorithm, that is using |
| * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
| * string is not that of a set of parentheses but that of a matched |
| * substring. Substrings are matched in reverse order of length, so |
| * 0 is the longest match. |
| * |
| * The string is fetched from the string passed to the match function, |
| * so you cannot call this function after freeing the string. |
| * |
| * Returns: (nullable): The matched substring, or %NULL if an error |
| * occurred. You have to free the string yourself |
| * |
| * Since: 2.14 |
| */ |
| gchar * |
| g_match_info_fetch (const GMatchInfo *match_info, |
| gint match_num) |
| { |
| gchar *match = NULL; |
| gint start, end; |
| |
| g_return_val_if_fail (match_info != NULL, NULL); |
| g_return_val_if_fail (match_num >= 0, NULL); |
| |
| /* match_num does not exist or it didn't matched, i.e. matching "b" |
| * against "(a)?b" then group 0 is empty. */ |
| if (!g_match_info_fetch_pos (match_info, match_num, &start, &end)) |
| match = NULL; |
| else if (start == -1) |
| match = g_strdup (""); |
| else |
| match = g_strndup (&match_info->string[start], end - start); |
| |
| return match; |
| } |
| |
| /** |
| * g_match_info_fetch_pos: |
| * @match_info: #GMatchInfo structure |
| * @match_num: number of the sub expression |
| * @start_pos: (out) (optional): pointer to location where to store |
| * the start position, or %NULL |
| * @end_pos: (out) (optional): pointer to location where to store |
| * the end position, or %NULL |
| * |
| * Retrieves the position in bytes of the @match_num'th capturing |
| * parentheses. 0 is the full text of the match, 1 is the first |
| * paren set, 2 the second, and so on. |
| * |
| * If @match_num is a valid sub pattern but it didn't match anything |
| * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos |
| * and @end_pos are set to -1 and %TRUE is returned. |
| * |
| * If the match was obtained using the DFA algorithm, that is using |
| * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
| * position is not that of a set of parentheses but that of a matched |
| * substring. Substrings are matched in reverse order of length, so |
| * 0 is the longest match. |
| * |
| * Returns: %TRUE if the position was fetched, %FALSE otherwise. If |
| * the position cannot be fetched, @start_pos and @end_pos are left |
| * unchanged |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_match_info_fetch_pos (const GMatchInfo *match_info, |
| gint match_num, |
| gint *start_pos, |
| gint *end_pos) |
| { |
| g_return_val_if_fail (match_info != NULL, FALSE); |
| g_return_val_if_fail (match_num >= 0, FALSE); |
| |
| /* check whether there was an error */ |
| if (match_info->matches < 0) |
| return FALSE; |
| |
| /* make sure the sub expression number they're requesting is less than |
| * the total number of sub expressions in the regex. When matching all |
| * (g_regex_match_all()), also compare against the number of matches */ |
| if ((uint32_t) match_num >= MAX (match_info->n_subpatterns + 1, (uint32_t) match_info->matches)) |
| return FALSE; |
| |
| if (start_pos != NULL) |
| *start_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num] : -1; |
| |
| if (end_pos != NULL) |
| *end_pos = (match_num < match_info->matches) ? match_info->offsets[2 * match_num + 1] : -1; |
| |
| return TRUE; |
| } |
| |
| /* |
| * Returns number of first matched subpattern with name @name. |
| * There may be more than one in case when DUPNAMES is used, |
| * and not all subpatterns with that name match; |
| * pcre2_substring_number_from_name() does not work in that case. |
| */ |
| static gint |
| get_matched_substring_number (const GMatchInfo *match_info, |
| const gchar *name) |
| { |
| gint entrysize; |
| PCRE2_SPTR first, last; |
| guchar *entry; |
| |
| if (!(match_info->regex->compile_opts & PCRE2_DUPNAMES)) |
| return pcre2_substring_number_from_name (match_info->regex->pcre_re, (PCRE2_SPTR8) name); |
| |
| /* This code is analogous to code from pcre2_substring.c: |
| * pcre2_substring_get_byname() */ |
| entrysize = pcre2_substring_nametable_scan (match_info->regex->pcre_re, |
| (PCRE2_SPTR8) name, |
| &first, |
| &last); |
| |
| if (entrysize <= 0) |
| return entrysize; |
| |
| for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
| { |
| guint n = (entry[0] << 8) + entry[1]; |
| if (n * 2 < match_info->n_offsets && match_info->offsets[n * 2] >= 0) |
| return n; |
| } |
| |
| return (first[0] << 8) + first[1]; |
| } |
| |
| /** |
| * g_match_info_fetch_named: |
| * @match_info: #GMatchInfo structure |
| * @name: name of the subexpression |
| * |
| * Retrieves the text matching the capturing parentheses named @name. |
| * |
| * If @name is a valid sub pattern name but it didn't match anything |
| * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
| * then an empty string is returned. |
| * |
| * The string is fetched from the string passed to the match function, |
| * so you cannot call this function after freeing the string. |
| * |
| * Returns: (nullable): The matched substring, or %NULL if an error |
| * occurred. You have to free the string yourself |
| * |
| * Since: 2.14 |
| */ |
| gchar * |
| g_match_info_fetch_named (const GMatchInfo *match_info, |
| const gchar *name) |
| { |
| gint num; |
| |
| g_return_val_if_fail (match_info != NULL, NULL); |
| g_return_val_if_fail (name != NULL, NULL); |
| |
| num = get_matched_substring_number (match_info, name); |
| if (num < 0) |
| return NULL; |
| else |
| return g_match_info_fetch (match_info, num); |
| } |
| |
| /** |
| * g_match_info_fetch_named_pos: |
| * @match_info: #GMatchInfo structure |
| * @name: name of the subexpression |
| * @start_pos: (out) (optional): pointer to location where to store |
| * the start position, or %NULL |
| * @end_pos: (out) (optional): pointer to location where to store |
| * the end position, or %NULL |
| * |
| * Retrieves the position in bytes of the capturing parentheses named @name. |
| * |
| * If @name is a valid sub pattern name but it didn't match anything |
| * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
| * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
| * |
| * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
| * If the position cannot be fetched, @start_pos and @end_pos |
| * are left unchanged. |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
| const gchar *name, |
| gint *start_pos, |
| gint *end_pos) |
| { |
| gint num; |
| |
| g_return_val_if_fail (match_info != NULL, FALSE); |
| g_return_val_if_fail (name != NULL, FALSE); |
| |
| num = get_matched_substring_number (match_info, name); |
| if (num < 0) |
| return FALSE; |
| |
| return g_match_info_fetch_pos (match_info, num, start_pos, end_pos); |
| } |
| |
| /** |
| * g_match_info_fetch_all: |
| * @match_info: a #GMatchInfo structure |
| * |
| * Bundles up pointers to each of the matching substrings from a match |
| * and stores them in an array of gchar pointers. The first element in |
| * the returned array is the match number 0, i.e. the entire matched |
| * text. |
| * |
| * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
| * "b" against "(a)?b") then an empty string is inserted. |
| * |
| * If the last match was obtained using the DFA algorithm, that is using |
| * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
| * strings are not that matched by sets of parentheses but that of the |
| * matched substring. Substrings are matched in reverse order of length, |
| * so the first one is the longest match. |
| * |
| * The strings are fetched from the string passed to the match function, |
| * so you cannot call this function after freeing the string. |
| * |
| * Returns: (transfer full): a %NULL-terminated array of gchar * |
| * pointers. It must be freed using g_strfreev(). If the previous |
| * match failed %NULL is returned |
| * |
| * Since: 2.14 |
| */ |
| gchar ** |
| g_match_info_fetch_all (const GMatchInfo *match_info) |
| { |
| gchar **result; |
| gint i; |
| |
| g_return_val_if_fail (match_info != NULL, NULL); |
| |
| if (match_info->matches < 0) |
| return NULL; |
| |
| result = g_new (gchar *, match_info->matches + 1); |
| for (i = 0; i < match_info->matches; i++) |
| result[i] = g_match_info_fetch (match_info, i); |
| result[i] = NULL; |
| |
| return result; |
| } |
| |
| |
| /* GRegex */ |
| |
| G_DEFINE_QUARK (g-regex-error-quark, g_regex_error) |
| |
| /** |
| * g_regex_ref: |
| * @regex: a #GRegex |
| * |
| * Increases reference count of @regex by 1. |
| * |
| * Returns: @regex |
| * |
| * Since: 2.14 |
| */ |
| GRegex * |
| g_regex_ref (GRegex *regex) |
| { |
| g_return_val_if_fail (regex != NULL, NULL); |
| g_atomic_int_inc (®ex->ref_count); |
| return regex; |
| } |
| |
| /** |
| * g_regex_unref: |
| * @regex: a #GRegex |
| * |
| * Decreases reference count of @regex by 1. When reference count drops |
| * to zero, it frees all the memory associated with the regex structure. |
| * |
| * Since: 2.14 |
| */ |
| void |
| g_regex_unref (GRegex *regex) |
| { |
| g_return_if_fail (regex != NULL); |
| |
| if (g_atomic_int_dec_and_test (®ex->ref_count)) |
| { |
| g_free (regex->pattern); |
| if (regex->pcre_re != NULL) |
| pcre2_code_free (regex->pcre_re); |
| g_free (regex); |
| } |
| } |
| |
| static pcre2_code * regex_compile (const gchar *pattern, |
| uint32_t compile_options, |
| uint32_t newline_options, |
| uint32_t bsr_options, |
| GError **error); |
| |
| static uint32_t get_pcre2_inline_compile_options (pcre2_code *re, |
| uint32_t compile_options); |
| |
| /** |
| * g_regex_new: |
| * @pattern: the regular expression |
| * @compile_options: compile options for the regular expression, or 0 |
| * @match_options: match options for the regular expression, or 0 |
| * @error: return location for a #GError |
| * |
| * Compiles the regular expression to an internal form, and does |
| * the initial setup of the #GRegex structure. |
| * |
| * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
| * g_regex_unref() when you are done with it |
| * |
| * Since: 2.14 |
| */ |
| GRegex * |
| g_regex_new (const gchar *pattern, |
| GRegexCompileFlags compile_options, |
| GRegexMatchFlags match_options, |
| GError **error) |
| { |
| GRegex *regex; |
| pcre2_code *re; |
| static gsize initialised = 0; |
| uint32_t pcre_compile_options; |
| uint32_t pcre_match_options; |
| uint32_t newline_options; |
| uint32_t bsr_options; |
| |
| g_return_val_if_fail (pattern != NULL, NULL); |
| g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
| G_GNUC_BEGIN_IGNORE_DEPRECATIONS |
| g_return_val_if_fail ((compile_options & ~(G_REGEX_COMPILE_MASK | |
| G_REGEX_JAVASCRIPT_COMPAT)) == 0, NULL); |
| G_GNUC_END_IGNORE_DEPRECATIONS |
| g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
| |
| if (g_once_init_enter (&initialised)) |
| { |
| int supports_utf8; |
| |
| pcre2_config (PCRE2_CONFIG_UNICODE, &supports_utf8); |
| if (!supports_utf8) |
| g_critical (_("PCRE library is compiled without UTF8 support")); |
| |
| g_once_init_leave (&initialised, supports_utf8 ? 1 : 2); |
| } |
| |
| if (G_UNLIKELY (initialised != 1)) |
| { |
| g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE, |
| _("PCRE library is compiled with incompatible options")); |
| return NULL; |
| } |
| |
| pcre_compile_options = get_pcre2_compile_options (compile_options); |
| pcre_match_options = get_pcre2_match_options (match_options, compile_options); |
| |
| newline_options = get_pcre2_newline_match_options (match_options); |
| if (newline_options == 0) |
| newline_options = get_pcre2_newline_compile_options (compile_options); |
| |
| if (newline_options == 0) |
| { |
| g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
| "Invalid newline flags"); |
| return NULL; |
| } |
| |
| bsr_options = get_pcre2_bsr_match_options (match_options); |
| if (!bsr_options) |
| bsr_options = get_pcre2_bsr_compile_options (compile_options); |
| |
| re = regex_compile (pattern, pcre_compile_options, |
| newline_options, bsr_options, error); |
| if (re == NULL) |
| return NULL; |
| |
| pcre_compile_options |= |
| get_pcre2_inline_compile_options (re, pcre_compile_options); |
| |
| regex = g_new0 (GRegex, 1); |
| regex->ref_count = 1; |
| regex->pattern = g_strdup (pattern); |
| regex->pcre_re = re; |
| regex->compile_opts = pcre_compile_options; |
| regex->orig_compile_opts = compile_options; |
| regex->match_opts = pcre_match_options; |
| regex->orig_match_opts = match_options; |
| |
| return regex; |
| } |
| |
| static pcre2_code * |
| regex_compile (const gchar *pattern, |
| uint32_t compile_options, |
| uint32_t newline_options, |
| uint32_t bsr_options, |
| GError **error) |
| { |
| pcre2_code *re; |
| pcre2_compile_context *context; |
| const gchar *errmsg; |
| PCRE2_SIZE erroffset; |
| gint errcode; |
| |
| context = pcre2_compile_context_create (NULL); |
| |
| /* set newline options */ |
| if (pcre2_set_newline (context, newline_options) != 0) |
| { |
| g_set_error (error, G_REGEX_ERROR, |
| G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
| "Invalid newline flags"); |
| pcre2_compile_context_free (context); |
| return NULL; |
| } |
| |
| /* set bsr options */ |
| if (pcre2_set_bsr (context, bsr_options) != 0) |
| { |
| g_set_error (error, G_REGEX_ERROR, |
| G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS, |
| "Invalid BSR flags"); |
| pcre2_compile_context_free (context); |
| return NULL; |
| } |
| |
| /* In case UTF-8 mode is used, also set PCRE2_NO_UTF_CHECK */ |
| if (compile_options & PCRE2_UTF) |
| compile_options |= PCRE2_NO_UTF_CHECK; |
| |
| compile_options |= PCRE2_UCP; |
| |
| /* compile the pattern */ |
| re = pcre2_compile ((PCRE2_SPTR8) pattern, |
| PCRE2_ZERO_TERMINATED, |
| compile_options, |
| &errcode, |
| &erroffset, |
| context); |
| pcre2_compile_context_free (context); |
| |
| /* if the compilation failed, set the error member and return |
| * immediately */ |
| if (re == NULL) |
| { |
| GError *tmp_error; |
| gchar *offset_str; |
| gchar *pcre2_errmsg = NULL; |
| int original_errcode; |
| |
| /* Translate the PCRE error code to GRegexError and use a translated |
| * error message if possible */ |
| original_errcode = errcode; |
| translate_compile_error (&errcode, &errmsg); |
| |
| if (!errmsg) |
| { |
| errmsg = _("unknown error"); |
| pcre2_errmsg = get_pcre2_error_string (original_errcode); |
| } |
| |
| /* PCRE uses byte offsets but we want to show character offsets */ |
| erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]); |
| |
| offset_str = g_strdup_printf ("%" G_GSIZE_FORMAT, erroffset); |
| tmp_error = g_error_new (G_REGEX_ERROR, errcode, |
| _("Error while compiling regular expression ‘%s’ " |
| "at char %s: %s"), |
| pattern, offset_str, |
| pcre2_errmsg ? pcre2_errmsg : errmsg); |
| g_propagate_error (error, tmp_error); |
| g_free (offset_str); |
| g_clear_pointer (&pcre2_errmsg, g_free); |
| |
| return NULL; |
| } |
| |
| return re; |
| } |
| |
| static uint32_t |
| get_pcre2_inline_compile_options (pcre2_code *re, |
| uint32_t compile_options) |
| { |
| uint32_t pcre_compile_options; |
| uint32_t nonpcre_compile_options; |
| |
| /* For options set at the beginning of the pattern, pcre puts them into |
| * compile options, e.g. "(?i)foo" will make the pcre structure store |
| * PCRE2_CASELESS even though it wasn't explicitly given for compilation. */ |
| nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
| pcre2_pattern_info (re, PCRE2_INFO_ALLOPTIONS, &pcre_compile_options); |
| compile_options = pcre_compile_options & G_REGEX_PCRE2_COMPILE_MASK; |
| compile_options |= nonpcre_compile_options; |
| |
| if (!(compile_options & PCRE2_DUPNAMES)) |
| { |
| uint32_t jchanged = 0; |
| pcre2_pattern_info (re, PCRE2_INFO_JCHANGED, &jchanged); |
| if (jchanged) |
| compile_options |= PCRE2_DUPNAMES; |
| } |
| |
| return compile_options; |
| } |
| |
| /** |
| * g_regex_get_pattern: |
| * @regex: a #GRegex structure |
| * |
| * Gets the pattern string associated with @regex, i.e. a copy of |
| * the string passed to g_regex_new(). |
| * |
| * Returns: the pattern of @regex |
| * |
| * Since: 2.14 |
| */ |
| const gchar * |
| g_regex_get_pattern (const GRegex *regex) |
| { |
| g_return_val_if_fail (regex != NULL, NULL); |
| |
| return regex->pattern; |
| } |
| |
| /** |
| * g_regex_get_max_backref: |
| * @regex: a #GRegex |
| * |
| * Returns the number of the highest back reference |
| * in the pattern, or 0 if the pattern does not contain |
| * back references. |
| * |
| * Returns: the number of the highest back reference |
| * |
| * Since: 2.14 |
| */ |
| gint |
| g_regex_get_max_backref (const GRegex *regex) |
| { |
| uint32_t value; |
| |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BACKREFMAX, &value); |
| |
| return value; |
| } |
| |
| /** |
| * g_regex_get_capture_count: |
| * @regex: a #GRegex |
| * |
| * Returns the number of capturing subpatterns in the pattern. |
| * |
| * Returns: the number of capturing subpatterns |
| * |
| * Since: 2.14 |
| */ |
| gint |
| g_regex_get_capture_count (const GRegex *regex) |
| { |
| uint32_t value; |
| |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_CAPTURECOUNT, &value); |
| |
| return value; |
| } |
| |
| /** |
| * g_regex_get_has_cr_or_lf: |
| * @regex: a #GRegex structure |
| * |
| * Checks whether the pattern contains explicit CR or LF references. |
| * |
| * Returns: %TRUE if the pattern contains explicit CR or LF references |
| * |
| * Since: 2.34 |
| */ |
| gboolean |
| g_regex_get_has_cr_or_lf (const GRegex *regex) |
| { |
| uint32_t value; |
| |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_HASCRORLF, &value); |
| |
| return !!value; |
| } |
| |
| /** |
| * g_regex_get_max_lookbehind: |
| * @regex: a #GRegex structure |
| * |
| * Gets the number of characters in the longest lookbehind assertion in the |
| * pattern. This information is useful when doing multi-segment matching using |
| * the partial matching facilities. |
| * |
| * Returns: the number of characters in the longest lookbehind assertion. |
| * |
| * Since: 2.38 |
| */ |
| gint |
| g_regex_get_max_lookbehind (const GRegex *regex) |
| { |
| uint32_t max_lookbehind; |
| |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_MAXLOOKBEHIND, |
| &max_lookbehind); |
| |
| return max_lookbehind; |
| } |
| |
| /** |
| * g_regex_get_compile_flags: |
| * @regex: a #GRegex |
| * |
| * Returns the compile options that @regex was created with. |
| * |
| * Depending on the version of PCRE that is used, this may or may not |
| * include flags set by option expressions such as `(?i)` found at the |
| * top-level within the compiled pattern. |
| * |
| * Returns: flags from #GRegexCompileFlags |
| * |
| * Since: 2.26 |
| */ |
| GRegexCompileFlags |
| g_regex_get_compile_flags (const GRegex *regex) |
| { |
| GRegexCompileFlags extra_flags; |
| uint32_t info_value; |
| |
| g_return_val_if_fail (regex != NULL, 0); |
| |
| /* Preserve original G_REGEX_OPTIMIZE */ |
| extra_flags = (regex->orig_compile_opts & G_REGEX_OPTIMIZE); |
| |
| /* Also include the newline options */ |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_NEWLINE, &info_value); |
| switch (info_value) |
| { |
| case PCRE2_NEWLINE_ANYCRLF: |
| extra_flags |= G_REGEX_NEWLINE_ANYCRLF; |
| break; |
| case PCRE2_NEWLINE_CRLF: |
| extra_flags |= G_REGEX_NEWLINE_CRLF; |
| break; |
| case PCRE2_NEWLINE_LF: |
| extra_flags |= G_REGEX_NEWLINE_LF; |
| break; |
| case PCRE2_NEWLINE_CR: |
| extra_flags |= G_REGEX_NEWLINE_CR; |
| break; |
| default: |
| break; |
| } |
| |
| /* Also include the bsr options */ |
| pcre2_pattern_info (regex->pcre_re, PCRE2_INFO_BSR, &info_value); |
| switch (info_value) |
| { |
| case PCRE2_BSR_ANYCRLF: |
| extra_flags |= G_REGEX_BSR_ANYCRLF; |
| break; |
| default: |
| break; |
| } |
| |
| return g_regex_compile_flags_from_pcre2 (regex->compile_opts) | extra_flags; |
| } |
| |
| /** |
| * g_regex_get_match_flags: |
| * @regex: a #GRegex |
| * |
| * Returns the match options that @regex was created with. |
| * |
| * Returns: flags from #GRegexMatchFlags |
| * |
| * Since: 2.26 |
| */ |
| GRegexMatchFlags |
| g_regex_get_match_flags (const GRegex *regex) |
| { |
| uint32_t flags; |
| |
| g_return_val_if_fail (regex != NULL, 0); |
| |
| flags = g_regex_match_flags_from_pcre2 (regex->match_opts); |
| flags |= (regex->orig_match_opts & G_REGEX_MATCH_NEWLINE_MASK); |
| flags |= (regex->orig_match_opts & (G_REGEX_MATCH_BSR_ANY | G_REGEX_MATCH_BSR_ANYCRLF)); |
| |
| return flags; |
| } |
| |
| /** |
| * g_regex_match_simple: |
| * @pattern: the regular expression |
| * @string: the string to scan for matches |
| * @compile_options: compile options for the regular expression, or 0 |
| * @match_options: match options, or 0 |
| * |
| * Scans for a match in @string for @pattern. |
| * |
| * This function is equivalent to g_regex_match() but it does not |
| * require to compile the pattern with g_regex_new(), avoiding some |
| * lines of code when you need just to do a match without extracting |
| * substrings, capture counts, and so on. |
| * |
| * If this function is to be called on the same @pattern more than |
| * once, it's more efficient to compile the pattern once with |
| * g_regex_new() and then use g_regex_match(). |
| * |
| * Returns: %TRUE if the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_regex_match_simple (const gchar *pattern, |
| const gchar *string, |
| GRegexCompileFlags compile_options, |
| GRegexMatchFlags match_options) |
| { |
| GRegex *regex; |
| gboolean result; |
| |
| regex = g_regex_new (pattern, compile_options, G_REGEX_MATCH_DEFAULT, NULL); |
| if (!regex) |
| return FALSE; |
| result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL); |
| g_regex_unref (regex); |
| return result; |
| } |
| |
| /** |
| * g_regex_match: |
| * @regex: a #GRegex structure from g_regex_new() |
| * @string: the string to scan for matches |
| * @match_options: match options |
| * @match_info: (out) (optional): pointer to location where to store |
| * the #GMatchInfo, or %NULL if you do not need it |
| * |
| * Scans for a match in @string for the pattern in @regex. |
| * The @match_options are combined with the match options specified |
| * when the @regex structure was created, letting you have more |
| * flexibility in reusing #GRegex structures. |
| * |
| * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
| * |
| * A #GMatchInfo structure, used to get information on the match, |
| * is stored in @match_info if not %NULL. Note that if @match_info |
| * is not %NULL then it is created even if the function returns %FALSE, |
| * i.e. you must free it regardless if regular expression actually matched. |
| * |
| * To retrieve all the non-overlapping matches of the pattern in |
| * string you can use g_match_info_next(). |
| * |
| * |[<!-- language="C" --> |
| * static void |
| * print_uppercase_words (const gchar *string) |
| * { |
| * // Print all uppercase-only words. |
| * GRegex *regex; |
| * GMatchInfo *match_info; |
| * |
| * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
| * g_regex_match (regex, string, 0, &match_info); |
| * while (g_match_info_matches (match_info)) |
| * { |
| * gchar *word = g_match_info_fetch (match_info, 0); |
| * g_print ("Found: %s\n", word); |
| * g_free (word); |
| * g_match_info_next (match_info, NULL); |
| * } |
| * g_match_info_free (match_info); |
| * g_regex_unref (regex); |
| * } |
| * ]| |
| * |
| * @string is not copied and is used in #GMatchInfo internally. If |
| * you use any #GMatchInfo method (except g_match_info_free()) after |
| * freeing or modifying @string then the behaviour is undefined. |
| * |
| * Returns: %TRUE is the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_regex_match (const GRegex *regex, |
| const gchar *string, |
| GRegexMatchFlags match_options, |
| GMatchInfo **match_info) |
| { |
| return g_regex_match_full (regex, string, -1, 0, match_options, |
| match_info, NULL); |
| } |
| |
| /** |
| * g_regex_match_full: |
| * @regex: a #GRegex structure from g_regex_new() |
| * @string: (array length=string_len): the string to scan for matches |
| * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
| * @start_position: starting index of the string to match, in bytes |
| * @match_options: match options |
| * @match_info: (out) (optional): pointer to location where to store |
| * the #GMatchInfo, or %NULL if you do not need it |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Scans for a match in @string for the pattern in @regex. |
| * The @match_options are combined with the match options specified |
| * when the @regex structure was created, letting you have more |
| * flexibility in reusing #GRegex structures. |
| * |
| * Setting @start_position differs from just passing over a shortened |
| * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
| * that begins with any kind of lookbehind assertion, such as "\b". |
| * |
| * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
| * |
| * A #GMatchInfo structure, used to get information on the match, is |
| * stored in @match_info if not %NULL. Note that if @match_info is |
| * not %NULL then it is created even if the function returns %FALSE, |
| * i.e. you must free it regardless if regular expression actually |
| * matched. |
| * |
| * @string is not copied and is used in #GMatchInfo internally. If |
| * you use any #GMatchInfo method (except g_match_info_free()) after |
| * freeing or modifying @string then the behaviour is undefined. |
| * |
| * To retrieve all the non-overlapping matches of the pattern in |
| * string you can use g_match_info_next(). |
| * |
| * |[<!-- language="C" --> |
| * static void |
| * print_uppercase_words (const gchar *string) |
| * { |
| * // Print all uppercase-only words. |
| * GRegex *regex; |
| * GMatchInfo *match_info; |
| * GError *error = NULL; |
| * |
| * regex = g_regex_new ("[A-Z]+", G_REGEX_DEFAULT, G_REGEX_MATCH_DEFAULT, NULL); |
| * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
| * while (g_match_info_matches (match_info)) |
| * { |
| * gchar *word = g_match_info_fetch (match_info, 0); |
| * g_print ("Found: %s\n", word); |
| * g_free (word); |
| * g_match_info_next (match_info, &error); |
| * } |
| * g_match_info_free (match_info); |
| * g_regex_unref (regex); |
| * if (error != NULL) |
| * { |
| * g_printerr ("Error while matching: %s\n", error->message); |
| * g_error_free (error); |
| * } |
| * } |
| * ]| |
| * |
| * Returns: %TRUE is the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_regex_match_full (const GRegex *regex, |
| const gchar *string, |
| gssize string_len, |
| gint start_position, |
| GRegexMatchFlags match_options, |
| GMatchInfo **match_info, |
| GError **error) |
| { |
| GMatchInfo *info; |
| gboolean match_ok; |
| |
| g_return_val_if_fail (regex != NULL, FALSE); |
| g_return_val_if_fail (string != NULL, FALSE); |
| g_return_val_if_fail (start_position >= 0, FALSE); |
| g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
| g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
| |
| info = match_info_new (regex, string, string_len, start_position, |
| match_options, FALSE); |
| match_ok = g_match_info_next (info, error); |
| if (match_info != NULL) |
| *match_info = info; |
| else |
| g_match_info_free (info); |
| |
| return match_ok; |
| } |
| |
| /** |
| * g_regex_match_all: |
| * @regex: a #GRegex structure from g_regex_new() |
| * @string: the string to scan for matches |
| * @match_options: match options |
| * @match_info: (out) (optional): pointer to location where to store |
| * the #GMatchInfo, or %NULL if you do not need it |
| * |
| * Using the standard algorithm for regular expression matching only |
| * the longest match in the string is retrieved. This function uses |
| * a different algorithm so it can retrieve all the possible matches. |
| * For more documentation see g_regex_match_all_full(). |
| * |
| * A #GMatchInfo structure, used to get information on the match, is |
| * stored in @match_info if not %NULL. Note that if @match_info is |
| * not %NULL then it is created even if the function returns %FALSE, |
| * i.e. you must free it regardless if regular expression actually |
| * matched. |
| * |
| * @string is not copied and is used in #GMatchInfo internally. If |
| * you use any #GMatchInfo method (except g_match_info_free()) after |
| * freeing or modifying @string then the behaviour is undefined. |
| * |
| * Returns: %TRUE is the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_regex_match_all (const GRegex *regex, |
| const gchar *string, |
| GRegexMatchFlags match_options, |
| GMatchInfo **match_info) |
| { |
| return g_regex_match_all_full (regex, string, -1, 0, match_options, |
| match_info, NULL); |
| } |
| |
| /** |
| * g_regex_match_all_full: |
| * @regex: a #GRegex structure from g_regex_new() |
| * @string: (array length=string_len): the string to scan for matches |
| * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
| * @start_position: starting index of the string to match, in bytes |
| * @match_options: match options |
| * @match_info: (out) (optional): pointer to location where to store |
| * the #GMatchInfo, or %NULL if you do not need it |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Using the standard algorithm for regular expression matching only |
| * the longest match in the @string is retrieved, it is not possible |
| * to obtain all the available matches. For instance matching |
| * "<a> <b> <c>" against the pattern "<.*>" |
| * you get "<a> <b> <c>". |
| * |
| * This function uses a different algorithm (called DFA, i.e. deterministic |
| * finite automaton), so it can retrieve all the possible matches, all |
| * starting at the same point in the string. For instance matching |
| * "<a> <b> <c>" against the pattern "<.*>;" |
| * you would obtain three matches: "<a> <b> <c>", |
| * "<a> <b>" and "<a>". |
| * |
| * The number of matched strings is retrieved using |
| * g_match_info_get_match_count(). To obtain the matched strings and |
| * their position you can use, respectively, g_match_info_fetch() and |
| * g_match_info_fetch_pos(). Note that the strings are returned in |
| * reverse order of length; that is, the longest matching string is |
| * given first. |
| * |
| * Note that the DFA algorithm is slower than the standard one and it |
| * is not able to capture substrings, so backreferences do not work. |
| * |
| * Setting @start_position differs from just passing over a shortened |
| * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
| * that begins with any kind of lookbehind assertion, such as "\b". |
| * |
| * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
| * |
| * A #GMatchInfo structure, used to get information on the match, is |
| * stored in @match_info if not %NULL. Note that if @match_info is |
| * not %NULL then it is created even if the function returns %FALSE, |
| * i.e. you must free it regardless if regular expression actually |
| * matched. |
| * |
| * @string is not copied and is used in #GMatchInfo internally. If |
| * you use any #GMatchInfo method (except g_match_info_free()) after |
| * freeing or modifying @string then the behaviour is undefined. |
| * |
| * Returns: %TRUE is the string matched, %FALSE otherwise |
| * |
| * Since: 2.14 |
| */ |
| gboolean |
| g_regex_match_all_full (const GRegex *regex, |
| const gchar *string, |
| gssize string_len, |
| gint start_position, |
| GRegexMatchFlags match_options, |
| GMatchInfo **match_info, |
| GError **error) |
| { |
| GMatchInfo *info; |
| gboolean done; |
| pcre2_code *pcre_re; |
| gboolean retval; |
| uint32_t newline_options; |
| uint32_t bsr_options; |
| |
| g_return_val_if_fail (regex != NULL, FALSE); |
| g_return_val_if_fail (string != NULL, FALSE); |
| g_return_val_if_fail (start_position >= 0, FALSE); |
| g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
| g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
| |
| newline_options = get_pcre2_newline_match_options (match_options); |
| if (!newline_options) |
| newline_options = get_pcre2_newline_compile_options (regex->orig_compile_opts); |
| |
| bsr_options = get_pcre2_bsr_match_options (match_options); |
| if (!bsr_options) |
| bsr_options = get_pcre2_bsr_compile_options (regex->orig_compile_opts); |
| |
| /* For PCRE2 we need to turn off PCRE2_NO_AUTO_POSSESS, which is an |
| * optimization for normal regex matching, but results in omitting some |
| * shorter matches here, and an observable behaviour change. |
| * |
| * DFA matching is rather niche, and very rarely used according to |
| * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
| pcre_re = regex_compile (regex->pattern, |
| regex->compile_opts | PCRE2_NO_AUTO_POSSESS, |
| newline_options, bsr_options, error); |
| if (pcre_re == NULL) |
| return FALSE; |
| |
| info = match_info_new (regex, string, string_len, start_position, |
| match_options, TRUE); |
| |
| done = FALSE; |
| while (!done) |
| { |
| done = TRUE; |
| info->matches = pcre2_dfa_match (pcre_re, |
| (PCRE2_SPTR8) info->string, info->string_len, |
| info->pos, |
| (regex->match_opts | info->match_opts), |
| info->match_data, |
| info->match_context, |
| info->workspace, info->n_workspace); |
| if (info->matches == PCRE2_ERROR_DFA_WSSIZE) |
| { |
| /* info->workspace is too small. */ |
| info->n_workspace *= 2; |
| info->workspace = g_realloc_n (info->workspace, |
| info->n_workspace, |
| sizeof (gint)); |
| done = FALSE; |
| } |
| else if (info->matches == 0) |
| { |
| /* info->offsets is too small. */ |
| info->n_offsets *= 2; |
| info->offsets = g_realloc_n (info->offsets, |
| info->n_offsets, |
| sizeof (gint)); |
| pcre2_match_data_free (info->match_data); |
| info->match_data = pcre2_match_data_create (info->n_offsets, NULL); |
| done = FALSE; |
| } |
| else if (IS_PCRE2_ERROR (info->matches)) |
| { |
| gchar *error_msg = get_match_error_message (info->matches); |
| |
| g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH, |
| _("Error while matching regular expression %s: %s"), |
| regex->pattern, error_msg); |
| g_clear_pointer (&error_msg, g_free); |
| } |
| else if (info->matches != PCRE2_ERROR_NOMATCH) |
| { |
| if (!recalc_match_offsets (info, error)) |
| info->matches = PCRE2_ERROR_NOMATCH; |
| } |
| } |
| |
| pcre2_code_free (pcre_re); |
| |
| /* don’t assert that (info->matches <= info->n_subpatterns + 1) as that only |
| * holds true for a single match, rather than matching all */ |
| |
| /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
| info->pos = -1; |
| retval = info->matches >= 0; |
| |
| if (match_info != NULL) |
| *match_info = info; |
| else |
| g_match_info_free (info); |
| |
| return retval; |
| } |
| |
| /** |
| * g_regex_get_string_number: |
| * @regex: #GRegex structure |
| * @name: name of the subexpression |
| * |
| * Retrieves the number of the subexpression named @name. |
| * |
| * Returns: The number of the subexpression or -1 if @name |
| * does not exists |
| * |
| * Since: 2.14 |
| */ |
| gint |
| g_regex_get_string_number (const GRegex *regex, |
| const gchar *name) |
| { |
| gint num; |
| |
| g_return_val_if_fail (regex != NULL, -1); |
| g_return_val_if_fail (name != NULL, -1); |
| |
| num = pcre2_substring_number_from_name (regex->pcre_re, (PCRE2_SPTR8) name); |
| if (num == PCRE2_ERROR_NOSUBSTRING) |
| num = -1; |
| |
| return num; |
| } |
| |
| /** |
| * g_regex_split_simple: |
| * @pattern: the regular expression |
| * @string: the string to scan for matches |
| * @compile_options: compile options for the regular expression, or 0 |
| * @match_options: match options, or 0 |
| * |
| * Breaks the string on the pattern, and returns an array of |
| * the tokens. If the pattern contains capturing parentheses, |
| * then the text for each of the substrings will also be returned. |
| * If the pattern does not match anywhere in the string, then the |
| * whole string is returned as the first token. |
| * |
| * This function is equivalent to g_regex_split() but it does |
| * not require to compile the pattern with g_regex_new(), avoiding |
| * some lines of code when you need just to do a split without |
| * extracting substrings, capture counts, and so on. |
| * |
| * If this function is to be called on the same @pattern more than |
| * once, it's more efficient to compile the pattern once with |
| * g_regex_new() and then use g_regex_split(). |
| * |
| * As a special case, the result of splitting the empty string "" |
| * is an empty vector, not a vector containing a single string. |
| * The reason for this special case is that being able to represent |
| * an empty vector is typically more useful than consistent handling |
| * of empty elements. If you do need to represent empty elements, |
| * you'll need to check for the empty string before calling this |
| * function. |
| * |
| * A pattern that can match empty strings splits @string into |
| * separate characters wherever it matches the empty string between |
| * characters. For example splitting "ab c" using as a separator |
| * "\s*", you will get "a", "b" and "c". |
| * |
| * Returns: (transfer full): a %NULL-terminated array of strings. Free |
| * it using g_strfreev() |
| * |
| * Since: 2.14 |
| **/ |
| gchar ** |
| g_regex_split_simple (const gchar *pattern, |
| const gchar *string, |
| GRegexCompileFlags compile_options, |
| GRegexMatchFlags match_options) |
| { |
| GRegex *regex; |
| gchar **result; |
| |
| regex = g_regex_new (pattern, compile_options, 0, NULL); |
| if (!regex) |
| return NULL; |
| |
| result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL); |
| g_regex_unref (regex); |
| return result; |
| } |
| |
| /** |
| * g_regex_split: |
| * @regex: a #GRegex structure |
| * @string: the string to split with the pattern |
| * @match_options: match time option flags |
| * |
| * Breaks the string on the pattern, and returns an array of the tokens. |
| * If the pattern contains capturing parentheses, then the text for each |
| * of the substrings will also be returned. If the pattern does not match |
| * anywhere in the string, then the whole string is returned as the first |
| * token. |
| * |
| * As a special case, the result of splitting the empty string "" is an |
| * empty vector, not a vector containing a single string. The reason for |
| * this special case is that being able to represent an empty vector is |
| * typically more useful than consistent handling of empty elements. If |
| * you do need to represent empty elements, you'll need to check for the |
| * empty string before calling this function. |
| * |
| * A pattern that can match empty strings splits @string into separate |
| * characters wherever it matches the empty string between characters. |
| * For example splitting "ab c" using as a separator "\s*", you will get |
| * "a", "b" and "c". |
| * |
| * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
| * it using g_strfreev() |
| * |
| * Since: 2.14 |
| **/ |
| gchar ** |
| g_regex_split (const GRegex *regex, |
| const gchar *string, |
| GRegexMatchFlags match_options) |
| { |
| return g_regex_split_full (regex, string, -1, 0, |
| match_options, 0, NULL); |
| } |
| |
| /** |
| * g_regex_split_full: |
| * @regex: a #GRegex structure |
| * @string: (array length=string_len): the string to split with the pattern |
| * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
| * @start_position: starting index of the string to match, in bytes |
| * @match_options: match time option flags |
| * @max_tokens: the maximum number of tokens to split @string into. |
| * If this is less than 1, the string is split completely |
| * @error: return location for a #GError |
| * |
| * Breaks the string on the pattern, and returns an array of the tokens. |
| * If the pattern contains capturing parentheses, then the text for each |
| * of the substrings will also be returned. If the pattern does not match |
| * anywhere in the string, then the whole string is returned as the first |
| * token. |
| * |
| * As a special case, the result of splitting the empty string "" is an |
| * empty vector, not a vector containing a single string. The reason for |
| * this special case is that being able to represent an empty vector is |
| * typically more useful than consistent handling of empty elements. If |
| * you do need to represent empty elements, you'll need to check for the |
| * empty string before calling this function. |
| * |
| * A pattern that can match empty strings splits @string into separate |
| * characters wherever it matches the empty string between characters. |
| * For example splitting "ab c" using as a separator "\s*", you will get |
| * "a", "b" and "c". |
| * |
| * Setting @start_position differs from just passing over a shortened |
| * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern |
| * that begins with any kind of lookbehind assertion, such as "\b". |
| * |
| * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
| * it using g_strfreev() |
| * |
| * Since: 2.14 |
| **/ |
| gchar ** |
| g_regex_split_full (const GRegex *regex, |
| const gchar *string, |
| gssize string_len, |
| gint start_position, |
| GRegexMatchFlags match_options, |
| gint max_tokens, |
| GError **error) |
| { |
| GError *tmp_error = NULL; |
| GMatchInfo *match_info; |
| GList *list, *last; |
| gint i; |
| gint token_count; |
| gboolean match_ok; |
| /* position of the last separator. */ |
| gint last_separator_end; |
| /* was the last match 0 bytes long? */ |
| gboolean last_match_is_empty; |
| /* the returned array of char **s */ |
| gchar **string_list; |
| |
| g_return_val_if_fail (regex != NULL, NULL); |
| g_return_val_if_fail (string != NULL, NULL); |
| g_return_val_if_fail (start_position >= 0, NULL); |
| g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
| g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
| |
| if (max_tokens <= 0) |
| max_tokens = G_MAXINT; |
| |
| if (string_len < 0) |
| string_len = strlen (string); |
| |
| /* zero-length string */ |
| if (string_len - start_position == 0) |
| return g_new0 (gchar *, 1); |
| |
| if (max_tokens == 1) |
| { |
| string_list = g_new0 (gchar *, 2); |
| string_list[0] = g_strndup (&string[start_position], |
| string_len - start_position); |
| return string_list; |
| } |
| |
| list = NULL; |
| token_count = 0; |
| last_separator_end = start_position; |
| last_match_is_empty = FALSE; |
| |
| match_ok = g_regex_match_full (regex, string, string_len, start_position, |
| match_options, &match_info, &tmp_error); |
| |
| while (tmp_error == NULL) |
| { |
| if (match_ok) |
| { |
| last_match_is_empty = |
| (match_info->offsets[0] == match_info->offsets[1]); |
| |
| /* we need to skip empty separators at the same position of the end |
| * of another separator. e.g. the string is "a b" and the separator |
| * is " *", so from 1 to 2 we have a match and at position 2 we have |
| * an empty match. */ |
| if (last_separator_end != match_info->offsets[1]) |
| { |
| gchar *token; |
| gint match_count; |
| |
| token = g_strndup (string + last_separator_end, |
| match_info->offsets[0] - last_separator_end); |
| list = g_list_prepend (list, token); |
| token_count++; |
| |
| /* if there were substrings, these need to be added to |
| * the list. */ |
| match_count = g_match_info_get_match_count (match_info); |
| if (match_count > 1) |
| { |
| for (i = 1; i < match_count; i++) |
| list = g_list_prepend (list, g_match_info_fetch (match_info, i)); |
| } |
| } |
| } |
| else |
| { |
| /* if there was no match, copy to end of string. */ |
| if (!last_match_is_empty) |
| { |
| gchar *token = g_strndup (string + last_separator_end, |
| match_info->string_len - last_separator_end); |
| list = g_list_prepend (list, token); |
| } |
| /* no more tokens, end the loop. */ |
| break; |
| } |
| |
| /* -1 to leave room for the last part. */ |
| if (token_count >= max_tokens - 1) |
| { |
| /* we have reached the maximum number of tokens, so we copy |
| * the remaining part of the string. */ |
| if (last_match_is_empty) |
| { |
| /* the last match was empty, so we have moved one char |
| * after the real position to avoid empty matches at the |
| * same position. */ |
| match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
| } |
| /* the if is needed in the case we have terminated the available |
| * tokens, but we are at the end of the string, so there are no |
| * characters left to copy. */ |
| if (string_len > match_info->pos) |
| { |
| gchar *token = g_strndup (string + match_info->pos, |
| string_len - match_info->pos); |
| list = g_list_prepend (list, token); |
| } |
| /* end the loop. */ |
| break; |
| } |
| |
| last_separator_end = match_info->pos; |
| if (last_match_is_empty) |
| /* if the last match was empty, g_match_info_next() has moved |
| * forward to avoid infinite loops, but we still need to copy that |
| * character. */ |
| last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
| |
| match_ok = g_match_info_next (match_info, &tmp_error); |
| } |
| g_match_info_free (match_info); |
| if (tmp_error != NULL) |
| { |
| g_propagate_error (error, tmp_error); |
| g_list_free_full (list, g_free); |
| return NULL; |
| } |
| |
| string_list = g_new (gchar *, g_list_length (list) + 1); |
| i = 0; |
| for (last = g_list_last (list); last; last = g_list_previous (last)) |
| string_list[i++] = last->data; |
| string_list[i] = NULL; |
| g_list_free (list); |
| |
| return string_list; |
| } |
| |
| enum |
| { |
| REPL_TYPE_STRING, |
| REPL_TYPE_CHARACTER, |
| REPL_TYPE_SYMBOLIC_REFERENCE, |
| REPL_TYPE_NUMERIC_REFERENCE, |
| REPL_TYPE_CHANGE_CASE |
| }; |
| |
| typedef enum |
| { |
| CHANGE_CASE_NONE = 1 << 0, |
| CHANGE_CASE_UPPER = 1 << 1, |
| CHANGE_CASE_LOWER = 1 << 2, |
| CHANGE_CASE_UPPER_SINGLE = 1 << 3, |
| CHANGE_CASE_LOWER_SINGLE = 1 << 4, |
| CHANGE_CASE_SINGLE_MASK = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE, |
| CHANGE_CASE_LOWER_MASK = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE, |
| CHANGE_CASE_UPPER_MASK = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE |
| } ChangeCase; |
| |
| struct _InterpolationData |
| { |
| gchar *text; |
| gint type; |
| gint num; |
| gchar c; |
| ChangeCase change_case; |
| }; |
| |
| static void |
| free_interpolation_data (InterpolationData *data) |
| { |
| g_free (data->text); |
| g_free (data); |
| } |
| |
| static const gchar * |
| expand_escape (const gchar *replacement, |
| const gchar *p, |
| InterpolationData *data, |
| GError **error) |
| { |
| const gchar *q, *r; |
| gint x, d, h, i; |
| const gchar *error_detail; |
| gint base = 0; |
| GError *tmp_error = NULL; |
| |
| p++; |
| switch (*p) |
| { |
| case 't': |
| p++; |
| data->c = '\t'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'n': |
| p++; |
| data->c = '\n'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'v': |
| p++; |
| data->c = '\v'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'r': |
| p++; |
| data->c = '\r'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'f': |
| p++; |
| data->c = '\f'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'a': |
| p++; |
| data->c = '\a'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'b': |
| p++; |
| data->c = '\b'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case '\\': |
| p++; |
| data->c = '\\'; |
| data->type = REPL_TYPE_CHARACTER; |
| break; |
| case 'x': |
| p++; |
| x = 0; |
| if (*p == '{') |
| { |
| p++; |
| do |
| { |
| h = g_ascii_xdigit_value (*p); |
| if (h < 0) |
| { |
| error_detail = _("hexadecimal digit or “}” expected"); |
| goto error; |
| } |
| x = x * 16 + h; |
| p++; |
| } |
| while (*p != '}'); |
| p++; |
| } |
| else |
| { |
| for (i = 0; i < 2; i++) |
| { |
| h = g_ascii_xdigit_value (*p); |
| if (h < 0) |
| { |
| error_detail = _("hexadecimal digit expected"); |
| goto error; |
| } |
| x = x * 16 + h; |
| p++; |
| } |
| } |
| data->type = REPL_TYPE_STRING; |
| data->text = g_new0 (gchar, 8); |
| g_unichar_to_utf8 (x, data->text); |
| break; |
| case 'l': |
| p++; |
| data->type = REPL_TYPE_CHANGE_CASE; |
| data->change_case = CHANGE_CASE_LOWER_SINGLE; |
| break; |
| case 'u': |
| p++; |
| data->type = REPL_TYPE_CHANGE_CASE; |
| data->change_case = CHANGE_CASE_UPPER_SINGLE; |
| break; |
| case 'L': |
| p++; |
| data->type = REPL_TYPE_CHANGE_CASE; |
| data->change_case = CHANGE_CASE_LOWER; |
| break; |
| case 'U': |
| p++; |
| data->type = REPL_TYPE_CHANGE_CASE; |
| data->change_case = CHANGE_CASE_UPPER; |
| break; |
| case 'E': |
| p++; |
| data->type = REPL_TYPE_CHANGE_CASE; |
| data->change_case = CHANGE_CASE_NONE; |
| break; |
| case 'g': |
| p++; |
| if (*p != '<') |
| { |
| error_detail = _("missing “<” in symbolic reference"); |
| goto error; |
| } |
| q = p + 1; |
| do |
| { |
| p++; |
| if (!*p) |
| { |
| error_detail = _("unfinished symbolic reference"); |
| goto error; |
| } |
| } |
| while (*p != '>'); |
| if (p - q == 0) |
| { |
| error_detail = _("zero-length symbolic reference"); |
| goto error; |
| } |
| if (g_ascii_isdigit (*q)) |
| { |
| x = 0; |
| do |
| { |
| h = g_ascii_digit_value (*q); |
| if (h < 0) |
| { |
| error_detail = _("digit expected"); |
| p = q; |
| goto error; |
| } |
| x = x * 10 + h; |
| q++; |
| } |
| while (q != p); |
| data->num = x; |
| data->type = REPL_TYPE_NUMERIC_REFERENCE; |
| } |
| else |
| { |
| r = q; |
| do |
| { |
| if (!g_ascii_isalnum (*r)) |
| { |
| error_detail = _("illegal symbolic reference"); |
| p = r; |
| goto error; |
| } |
| r++; |
| } |
| while (r != p); |
| data->text = g_strndup (q, p - q); |
| data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
| } |
| p++; |
| break; |
| case '0': |
| /* if \0 is followed by a number is an octal number representing a |
| * character, else it is a numeric reference. */ |
| if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0) |
| { |
| base = 8; |
| p = g_utf8_next_char (p); |
| } |
| G_GNUC_FALLTHROUGH; |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| x = 0; |
| d = 0; |
| for (i = 0; i < 3; i++) |
| { |
| h = g_ascii_digit_value (*p); |
| if (h < 0) |
| break; |
| if (h > 7) |
| { |
| if (base == 8) |
| break; |
| else |
| base = 10; |
| } |
| if (i == 2 && base == 10) |
| break; |
| x = x * 8 + h; |
| d = d * 10 + h; |
| p++; |
| } |
| if (base == 8 || i == 3) |
| { |
| data->type = REPL_TYPE_STRING; |
| data->text = g_new0 (gchar, 8); |
| g_unichar_to_utf8 (x, data->text); |
| } |
| else |
| { |
| data->type = REPL_TYPE_NUMERIC_REFERENCE; |
| data->num = d; |
| } |
| break; |
| case 0: |
| error_detail = _("stray final “\\”"); |
| goto error; |
| break; |
| default: |
| error_detail = _("unknown escape sequence"); |
| goto error; |
| } |
| |
| return p; |
| |
| error: |
| /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
| tmp_error = g_error_new (G_REGEX_ERROR, |
| G_REGEX_ERROR_REPLACE, |
| _("Error while parsing replacement " |
| "text “%s” at char %lu: %s"), |
| replacement, |
| (gulong)(p - replacement), |
| error_detail); |
| g_propagate_error (error, tmp_error); |
| |
| return NULL; |
| } |
| |
| static GList * |
| split_replacement (const gchar *replacement, |
| GError **error) |
| { |
| GList *list = NULL; |
| InterpolationData *data; |
| const gchar *p, *start; |
| |
| start = p = replacement; |
| while (*p) |
| { |
| if (*p == '\\') |
| { |
| data = g_new0 (InterpolationData, 1); |
| start = p = expand_escape (replacement, p, data, error); |
| if (p == NULL) |
| { |
| g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
| free_interpolation_data (data); |
| |
| return NULL; |
| } |
| list = g_list_prepend (list, data); |
| } |
| else |
| { |
| p++; |
| if (*p == '\\' || *p == '\0') |
| { |
| if (p - start > 0) |
| { |
| data = g_new0 (InterpolationData, 1); |
| data->text = g_strndup (start, p - start); |
| data->type = REPL_TYPE_STRING; |
| list = g_list_prepend (list, data); |
| } |
| } |
| } |
| } |
| |
| return g_list_reverse (list); |
| } |
| |
| /* Change the case of c based on change_case. */ |
| #define CHANGE_CASE(c, change_case) \ |
| (((change_case) & CHANGE_CASE_LOWER_MASK) ? \ |
| g_unichar_tolower (c) : \ |
| g_unichar_toupper (c)) |
| |
| static void |
| string_append (GString *string, |
| const gchar *text, |
| ChangeCase *change_case) |
| { |
| gunichar c; |
| |
| if (text[0] == '\0') |
| return; |
| |
| if (*change_case == CHANGE_CASE_NONE) |
| { |
| g_string_append (string, text); |
| } |
| else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
| { |
| c = g_utf8_get_char (text); |
| g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
| g_string_append (string, g_utf8_next_char (text)); |
| *change_case = CHANGE_CASE_NONE; |
| } |
| else |
| { |
| while (*text != '\0') |
| { |
| c = g_utf8_get_char (text); |
| g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
| text = g_utf8_next_char (text); |
| } |
| } |
| } |
| |
| static gboolean |
| interpolate_replacement (const GMatchInfo *match_info, |
| GString *result, |
| gpointer data) |
| { |
| GList *list; |
| InterpolationData *idata; |
| gchar *match; |
| ChangeCase change_case = CHANGE_CASE_NONE; |
| |
| for (list = data; list; list = list->next) |
| { |
| idata = list->data; |
| switch (idata->type) |
| { |
| case REPL_TYPE_STRING: |
| string_append (result, idata->text, &change_case); |
| break; |
| case REPL_TYPE_CHARACTER: |
| g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
| if (change_case & CHANGE_CASE_SINGLE_MASK) |
| change_case = CHANGE_CASE_NONE; |
| break; |
| case REPL_TYPE_NUMERIC_REFERENCE: |
| match = g_match_info_fetch (match_info, idata->num); |
| if (match) |
| { |
| string_append (result, match, &change_case); |
| g_free (match); |
| } |
| break; |
| case REPL_TYPE_SYMBOLIC_REFERENCE: |
| match = g_match_info_fetch_named (match_info, idata->text); |
| if (match) |
| { |
| string_append (result, match, &change_case); |
| g_free (match); |
| } |
| break; |
| case REPL_TYPE_CHANGE_CASE: |
| change_case = idata->change_case; |
| break; |
| } |
| } |
| |
| return FALSE; |
| } |
| |
| /* whether actual match_info is needed for replacement, i.e. |
| * whether there are references |
| */ |
| static gboolean |
| interpolation_list_needs_match (GList *list) |
| { |
| while (list != NULL) |
| { |
| InterpolationData *data = list->data; |
| |
| if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
| data->type == REPL_TYPE_NUMERIC_REFERENCE) |
| { |
| return TRUE; |
| } |
| |
| list = list->next; |
| } |
| |
| return FALSE; |
| } |
| |
| /** |
| * g_regex_replace: |
| * @regex: a #GRegex structure |
| * @string: (array length=string_len): the string to perform matches against |
| * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
| * @start_position: starting index of the string to match, in bytes |
| * @replacement: text to replace each match with |
| * @match_options: options for the match |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Replaces all occurrences of the pattern in @regex with the |
| * replacement text. Backreferences of the form '\number' or |
| * '\g<number>' in the replacement text are interpolated by the |
| * number-th captured subexpression of the match, '\g<name>' refers |
| * to the captured subexpression with the given name. '\0' refers |
| * to the complete match, but '\0' followed by a number is the octal |
| * representation of a character. To include a literal '\' in the |
| * replacement, write '\\\\'. |
| * |
| * There are also escapes that changes the case of the following text: |
| * |
| * - \l: Convert to lower case the next character |
| * - \u: Convert to upper case the next character |
| * - \L: Convert to lower case till \E |
| * - \U: Convert to upper case till \E |
| * - \E: End case modification |
| * |
| * If you do not need to use backreferences use g_regex_replace_literal(). |
| * |
| * The @replacement string must be UTF-8 encoded even if %G_REGEX_RAW was |
| * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
| * you can use g_regex_replace_literal(). |
| * |
| * Setting @start_position differs from just passing over a shortened |
| * string and setting %G_REGEX_MATCH_NOTBOL in the case of a pattern that |
| * begins with any kind of lookbehind assertion, such as "\b". |
| * |
| * Returns: a newly allocated string containing the replacements |
| * |
| * Since: 2.14 |
| */ |
| gchar * |
| g_regex_replace (const GRegex *regex, |
| const gchar *string, |
| gssize string_len, |
| gint start_position, |
| const gchar *replacement, |
| GRegexMatchFlags match_options, |
| GError **error) |
| { |
| gchar *result; |
| GList *list; |
| GError *tmp_error = NULL; |
| |
| g_return_val_if_fail (regex != NULL, NULL); |
| g_return_val_if_fail (string != NULL, NULL); |
| g_return_val_if_fail (start_position >= 0, NULL); |
| g_return_val_if_fail (replacement != NULL, NULL); |
| g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
| g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
| |
| list = split_replacement (replacement, &tmp_error); |
| if (tmp_error != NULL) |
| { |
| g_propagate_error (error, tmp_error); |
| return NULL; |
| } |
| |
| result = g_regex_replace_eval (regex, |
| string, string_len, start_position, |
| match_options, |
| interpolate_replacement, |
| (gpointer)list, |
| &tmp_error); |
| if (tmp_error != NULL) |
| g_propagate_error (error, tmp_error); |
| |
| g_list_free_full (list, (GDestroyNotify) free_interpolation_data); |
| |
| return result; |
| } |
| |
| static gboolean |
| literal_replacement (const GMatchInfo *match_info, |
| GString *result, |
| gpointer data) |
| { |
| g_string_append (result, data); |
| return FALSE; |
| } |
| |
| /** |
| * g_regex_replace_literal: |
| * @regex: a #GRegex structure |
| * @string: (array length=string_len): the string to perform matches against |
| * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
| * @start_position: starting index of the string to match, in bytes |
| * @replacement: text to replace each match with |
| * @match_options: options for the match |
| * @error: location to store the error occurring, or %NULL to ignore errors |
| * |
| * Replaces all occurrences of the pattern in @regex with the |
| * replacement text. @replacement is replaced literally, to |
| * include backreferences use g_regex_replace(). |
| * |
| * Setting @start_position differs from just passing over a |
| * shortened string and setting %G_REGEX_MATCH_NOTBOL in the |
| * case of a pattern that begins with any kind of lookbehind |
| * assertion, such as "\b". |
| * |
| * Returns: a newly allocated string containing the replacements |
| * |
| * Since: 2.14 |
| */ |
| gchar * |
| g_regex_replace_literal (const GRegex *regex, |
| const gchar *string, |
| gssize string_len, |
| gint start_pos
|