Re: [PATCH] Fix invalid escape sequence warnings

From: Vishal Chourasia
Date: Tue Aug 29 2023 - 03:45:14 EST


On 8/23/23 05:00, Andrii Nakryiko wrote:
> On Wed, Aug 16, 2023 at 5:22 AM Vishal Chourasia <vishalc@xxxxxxxxxxxxx> wrote:
>>
>> The Python script `bpf_doc.py` uses regular expressions with
>> backslashes in string literals, which results in SyntaxWarnings
>> during its execution.
>>
>> This patch addresses these warnings by converting relevant string
>> literals to raw strings, which interpret backslashes as literal
>> characters. This ensures that the regular expressions are parsed
>> correctly without causing any warnings.
>>
>> Signed-off-by: Vishal Chourasia <vishalc@xxxxxxxxxxxxx>
>> Reported-by: Srikar Dronamraju <srikar@xxxxxxxxxxxxxxxxxx>
>>
>> ---
>> scripts/bpf_doc.py | 34 +++++++++++++++++-----------------
>> 1 file changed, 17 insertions(+), 17 deletions(-)
>>
>> diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py
>> index eaae2ce78381..dfd819c952b2 100755
>> --- a/scripts/bpf_doc.py
>> +++ b/scripts/bpf_doc.py
>> @@ -59,9 +59,9 @@ class Helper(APIElement):
>> Break down helper function protocol into smaller chunks: return type,
>> name, distincts arguments.
>> """
>> - arg_re = re.compile('((\w+ )*?(\w+|...))( (\**)(\w+))?$')
>> + arg_re = re.compile(r'((\w+ )*?(\w+|...))( (\**)(\w+))?$')
>> res = {}
>> - proto_re = re.compile('(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
>> + proto_re = re.compile(r'(.+) (\**)(\w+)\(((([^,]+)(, )?){1,5})\)$')
>>
>> capture = proto_re.match(self.proto)
>> res['ret_type'] = capture.group(1)
>> @@ -114,11 +114,11 @@ class HeaderParser(object):
>> return Helper(proto=proto, desc=desc, ret=ret)
>>
>> def parse_symbol(self):
>> - p = re.compile(' \* ?(BPF\w+)$')
>> + p = re.compile(r' \* ?(BPF\w+)$')
>> capture = p.match(self.line)
>> if not capture:
>> raise NoSyscallCommandFound
>> - end_re = re.compile(' \* ?NOTES$')
>> + end_re = re.compile(r' \* ?NOTES$')
>> end = end_re.match(self.line)
>> if end:
>> raise NoSyscallCommandFound
>> @@ -133,7 +133,7 @@ class HeaderParser(object):
>> # - Same as above, with "const" and/or "struct" in front of type
>> # - "..." (undefined number of arguments, for bpf_trace_printk())
>> # There is at least one term ("void"), and at most five arguments.
>> - p = re.compile(' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
>> + p = re.compile(r' \* ?((.+) \**\w+\((((const )?(struct )?(\w+|\.\.\.)( \**\w+)?)(, )?){1,5}\))$')
>> capture = p.match(self.line)
>> if not capture:
>> raise NoHelperFound
>> @@ -141,7 +141,7 @@ class HeaderParser(object):
>> return capture.group(1)
>>
>> def parse_desc(self, proto):
>> - p = re.compile(' \* ?(?:\t| {5,8})Description$')
>> + p = re.compile(r' \* ?(?:\t| {5,8})Description$')
>> capture = p.match(self.line)
>> if not capture:
>> raise Exception("No description section found for " + proto)
>> @@ -154,7 +154,7 @@ class HeaderParser(object):
>> if self.line == ' *\n':
>> desc += '\n'
>> else:
>> - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
>> + p = re.compile(r' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
>> capture = p.match(self.line)
>> if capture:
>> desc_present = True
>> @@ -167,7 +167,7 @@ class HeaderParser(object):
>> return desc
>>
>> def parse_ret(self, proto):
>> - p = re.compile(' \* ?(?:\t| {5,8})Return$')
>> + p = re.compile(r' \* ?(?:\t| {5,8})Return$')
>> capture = p.match(self.line)
>> if not capture:
>> raise Exception("No return section found for " + proto)
>> @@ -180,7 +180,7 @@ class HeaderParser(object):
>> if self.line == ' *\n':
>> ret += '\n'
>> else:
>> - p = re.compile(' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
>> + p = re.compile(r' \* ?(?:\t| {5,8})(?:\t| {8})(.*)')
>> capture = p.match(self.line)
>> if capture:
>> ret_present = True
>> @@ -219,12 +219,12 @@ class HeaderParser(object):
>> self.seek_to('enum bpf_cmd {',
>> 'Could not find start of bpf_cmd enum', 0)
>> # Searches for either one or more BPF\w+ enums
>> - bpf_p = re.compile('\s*(BPF\w+)+')
>> + bpf_p = re.compile(r'\s*(BPF\w+)+')
>> # Searches for an enum entry assigned to another entry,
>> # for e.g. BPF_PROG_RUN = BPF_PROG_TEST_RUN, which is
>> # not documented hence should be skipped in check to
>> # determine if the right number of syscalls are documented
>> - assign_p = re.compile('\s*(BPF\w+)\s*=\s*(BPF\w+)')
>> + assign_p = re.compile(r'\s*(BPF\w+)\s*=\s*(BPF\w+)')
>> bpf_cmd_str = ''
>> while True:
>> capture = assign_p.match(self.line)
>> @@ -239,7 +239,7 @@ class HeaderParser(object):
>> break
>> self.line = self.reader.readline()
>> # Find the number of occurences of BPF\w+
>> - self.enum_syscalls = re.findall('(BPF\w+)+', bpf_cmd_str)
>> + self.enum_syscalls = re.findall(r'(BPF\w+)+', bpf_cmd_str)
>>
>> def parse_desc_helpers(self):
>> self.seek_to(helpersDocStart,
>> @@ -263,7 +263,7 @@ class HeaderParser(object):
>> self.seek_to('#define ___BPF_FUNC_MAPPER(FN, ctx...)',
>> 'Could not find start of eBPF helper definition list')
>> # Searches for one FN(\w+) define or a backslash for newline
>> - p = re.compile('\s*FN\((\w+), (\d+), ##ctx\)|\\\\')
>> + p = re.compile(r'\s*FN\((\w+), (\d+), ##ctx\)|\\\\')
>> fn_defines_str = ''
>> i = 0
>> while True:
>> @@ -278,7 +278,7 @@ class HeaderParser(object):
>> break
>> self.line = self.reader.readline()
>> # Find the number of occurences of FN(\w+)
>> - self.define_unique_helpers = re.findall('FN\(\w+, \d+, ##ctx\)', fn_defines_str)
>> + self.define_unique_helpers = re.findall(r'FN\(\w+, \d+, ##ctx\)', fn_defines_str)
>>
>> def validate_helpers(self):
>> last_helper = ''
>> @@ -425,7 +425,7 @@ class PrinterRST(Printer):
>> try:
>> cmd = ['git', 'log', '-1', '--pretty=format:%cs', '--no-patch',
>> '-L',
>> - '/{}/,/\*\//:include/uapi/linux/bpf.h'.format(delimiter)]
>> + r'/{}/,/\*\//:include/uapi/linux/bpf.h'.format(delimiter)]
>
> this one is not a regex, do we still need to change it?
Yes, this one needs to change as well. Although the string is not a
regex, it still contains the invalid escape sequences '\*' and '\/', and
those are what trigger the warnings we're seeing when compiling the
kernel with Python 3.12.

The choice between using an 'r'-prefixed string or manually escaping
each backslash is largely a matter of preference. It's worth noting that
the 'r' prefix is not only commonly employed for regular expressions but
also in contexts where backslashes should be treated as literal
characters rather than escape sequences.

Should I send another patch that escapes the backslashes in the
non-regex string literals instead of using the 'r' prefix?
>> date = subprocess.run(cmd, cwd=linuxRoot,
>> capture_output=True, check=True)
>> return date.stdout.decode().rstrip()
>> @@ -496,7 +496,7 @@ HELPERS
>> date=lastUpdate))
>>
>> def print_footer(self):
>> - footer = '''
>> + footer = r'''
>
> same here, not a regex string
>
>> EXAMPLES
>> ========
>>
>> @@ -598,7 +598,7 @@ SEE ALSO
>> one_arg = '{}{}'.format(comma, a['type'])
>> if a['name']:
>> if a['star']:
>> - one_arg += ' {}**\ '.format(a['star'].replace('*', '\\*'))
>> + one_arg += r' {}**\ '.format(a['star'].replace('*', '\\*'))
>
> and this one as well?
>
>> else:
>> one_arg += '** '
>> one_arg += '*{}*\\ **'.format(a['name'])
>> --
>> 2.41.0
>>