Add copyright check to pre-commit-config

2024-10-05 20:47:46 -04:00 · 2021-09-08 17:38:14 +02:00 · 2021-09-08 17:38:14 +02:00 · 798a174686
commit 798a174686
parent d5877fdf37
5 changed files with 4782 additions and 10 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -3,7 +3,7 @@
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.4.0
+    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
        # note: whitespace exclusions use multiline regex, see https://pre-commit.com/#regular-expressions
@ -26,12 +26,12 @@ repos:
        args: ['-f=lf']
      - id: double-quote-string-fixer
  - repo: https://gitlab.com/pycqa/flake8
-    rev: 3.8.4
+    rev: 3.9.2
    hooks:
      - id: flake8
        args: ['--config=.flake8', '--tee', '--benchmark']
  - repo: https://github.com/pycqa/isort
-    rev: 5.6.4
+    rev: 5.9.3
    hooks:
      - id: isort
        name: isort (python)
@ -92,11 +92,22 @@ repos:
      - id: mypy-check
        name: Check type annotations in python files
        entry: tools/ci/check_type_comments.py
-        additional_dependencies: ['mypy==0.800', 'mypy-extensions==0.4.3']
+        additional_dependencies:
          - 'mypy==0.800'
          - 'mypy-extensions==0.4.3'
        language: python
        types: [python]
      - id: check-copyright
        name: Check copyright notices
        entry: tools/ci/check_copyright.py --verbose --replace
        additional_dependencies:
          - comment_parser == 1.2.3
          - thefuzz[speedup] == 0.19.0
        language: python
        files: \.(py|c|h|cpp|hpp|ld)$
        require_serial: true
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.4.0
+    rev: v4.0.1
    hooks:
      - id: file-contents-sorter
-        files: '(tools\/ci\/executable-list\.txt|tools\/ci\/mypy_ignore_list\.txt)'
+        files: 'tools\/ci\/(executable-list\.txt|mypy_ignore_list\.txt|check_copyright_ignore\.txt)'
--- a/docs/en/contribute/install-pre-commit-hook.rst
+++ b/docs/en/contribute/install-pre-commit-hook.rst
@ -6,7 +6,7 @@ Required Dependency
 Python 3.6.1 or above. This is our recommended Python version for IDF developers.
-If you still have versions not compatible, please do not install pre-commit hook and update your python versions.
+If your Python version is still not compatible, please do not install the pre-commit hook; update your Python version first.
 Install pre-commit
 ------------------
@ -30,17 +30,29 @@ Run ``pre-commit uninstall``
 What's More?
 ------------
-For detailed usage, Please refer to the documentation of pre-commit_.
+For detailed usage, please refer to the documentation of pre-commit_.
-.. _pre-commit: http://www.pre-commit.com/
+.. _pre-commit: https://www.pre-commit.com/
 Common Problems For Windows Users
 ---------------------------------
-1. ``/usr/bin/env: python: Permission denied.``
+``/usr/bin/env: python: Permission denied.``
   If you're in Git Bash or MSYS terminal, please check the python executable location by running ``which python``.
   If the executable is under ``~/AppData/Local/Microsoft/WindowsApps/``, then it's a link to Windows AppStore, not a real one.
   Please install python manually and update this in your ``PATH`` environment variable.
 Your %USERPROFILE% contains non-ASCII characters
   ``pre-commit`` may fail when initializing an environment for a particular hook when the path of ``pre-commit``'s cache contains non-ASCII characters. The solution is to set ``PRE_COMMIT_HOME`` to a path containing only standard characters before running pre-commit.
   - CMD: ``set PRE_COMMIT_HOME=C:\somepath\pre-commit``
   - PowerShell: ``$Env:PRE_COMMIT_HOME = "C:\somepath\pre-commit"``
   - git bash: ``export PRE_COMMIT_HOME="/c/somepath/pre-commit"``
--- a/tools/ci/check_copyright.py
+++ b/tools/ci/check_copyright.py
@ -0,0 +1,457 @@
 #!/usr/bin/env python
 # SPDX-FileCopyrightText: 2021 Espressif Systems (Shanghai) CO LTD
 # SPDX-License-Identifier: Apache-2.0
 """
 Check files for copyright headers:
 - file not on ignore list:
    - old Espressif copyright -> replace with SPDX
    - SPDX with invalid year or old company name -> replace with valid SPDX
    - other SPDX copyright -> PASS
    - non-SPDX copyright -> FAIL
    - no copyright -> insert Espressif copyright
 - file on ignore list:
    - old Espressif copyright -> replace with SPDX, remove from ignore list
    - SPDX with invalid year or company format -> replace with valid SPDX and remove from ignore list
    else -> keep on ignore list
 """
 import argparse
 import datetime
 import os
 import re
 import sys
 import textwrap
 from typing import List, Tuple
 from comment_parser import comment_parser
 from comment_parser.parsers.common import Comment
 from thefuzz import fuzz
 # Base directory used to locate the ignore list: the IDF_PATH environment variable,
 # falling back to the current working directory when it is not set.
 IDF_PATH = os.getenv('IDF_PATH', os.getcwd())
 # Text file with one file path per line; failures of files listed there do not abort the commit on their own.
 IGNORE_LIST_FN = os.path.join(IDF_PATH, 'tools/ci/check_copyright_ignore.txt')
 # Explanation printed before the list of files which failed the check.
 # The {example} placeholder is filled with a sample SPDX header.
 CHECK_FAIL_MESSAGE = textwrap.dedent('''\
    To make a file, not on the ignore list to pass the test it needs to contain both:
    an SPDX-FileCopyrightText and
    an SPDX-License-Identifier. For example:
    {example}
    More information about SPDX license identifiers can be found here:
    https://spdx.github.io/spdx-spec/appendix-V-using-SPDX-short-identifiers-in-source-files/
    To have this hook automatically insert the standard Espressif copyright notice,
    ensure the word "copyright" is not in any comment up to line 30 and the file is not on the ignore list.
    Below is a list of files, which failed the copyright check.
    Files prefixed with "(ignore)" are on the ignore list and their presence alone won't cause the check to fail.
    ''')
 # Explanation printed after the list of files which this script modified on disk.
 CHECK_MODIFY_MESSAGE = textwrap.dedent('''\
    Above is a list of files, which were modified. Please check their contents, stage them and run the commit again!
    Files prefixed with "(ignore)" were on the ignore list at the time of invoking this script.
    They may have been removed if noted above.
    Pre-commit's option --show-diff-on-failure may be used to show a diff when hooks modify files.
    ''')
 # This is an old header style, which this script
 # attempts to detect and replace with a new SPDX license identifier.
 # Detection is fuzzy (see detect_old_header_style), so the years below need not match exactly.
 # The number of lines of this text also determines how many lines replace_copyright removes.
 OLD_APACHE_HEADER = textwrap.dedent('''\
    Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
    ''')
 # New headers to be used
 # Full header inserted into Python files; {years} is filled by format_years().
 NEW_APACHE_HEADER_PYTHON = textwrap.dedent('''\
    # SPDX-FileCopyrightText: {years} Espressif Systems (Shanghai) CO LTD
    # SPDX-License-Identifier: Apache-2.0
    ''')
 # Single-line notice templates used when an existing Espressif SPDX notice is rewritten in place:
 # Python comment, line inside a multiline comment, and "//" comment respectively.
 PYTHON_NOTICE = '# SPDX-FileCopyrightText: {years} Espressif Systems (Shanghai) CO LTD'
 NOTICE_MULTILINE = ' * SPDX-FileCopyrightText: {years} Espressif Systems (Shanghai) CO LTD'
 NOTICE = '// SPDX-FileCopyrightText: {years} Espressif Systems (Shanghai) CO LTD'
 # Full header inserted into all non-Python files (block comment style).
 NEW_APACHE_HEADER = textwrap.dedent('''\
    /*
     * SPDX-FileCopyrightText: {years} Espressif Systems (Shanghai) CO LTD
     *
     * SPDX-License-Identifier: Apache-2.0
     */
    ''')
 # MIME types handed to comment_parser, keyed by the language names used in get_file_mime().
 MIME = {
    'python': 'text/x-python',
    'c': 'text/x-c',
    'cpp': 'text/x-c++'
 }
 # terminal color output (ANSI escape sequences; TERMINAL_RESET switches back to the default style)
 TERMINAL_RESET = '\33[0m'
 TERMINAL_YELLOW = '\33[93m'
 TERMINAL_GREEN = '\33[92m'
 TERMINAL_RED = '\33[91m'
 TERMINAL_GRAY = '\33[90m'
 class UnsupportedFileType(Exception):
    """Exception raised for unsupported file types.

    Attributes:
        file_name -- input file which caused the error
        fine_name -- misspelled legacy alias of file_name, kept for backward compatibility
        message -- explanation of the error
    """
    def __init__(self, file_name: str, message: str='this file type is not supported') -> None:
        self.file_name = file_name
        # The attribute used to be stored only under this misspelled name;
        # keep it so that existing readers of the old name do not break.
        self.fine_name = file_name
        self.message = message
        super().__init__(self.message)

    def __str__(self) -> str:
        return f'{self.file_name}: {self.message}'
 class NotFound(Exception):
    """Signals that a searched-for item is missing.

    Attributes:
        thing -- description of the item which could not be found
    """
    def __init__(self, thing: str='something') -> None:
        super().__init__(thing)
        self.thing = thing

    def __str__(self) -> str:
        return '{} was not found'.format(self.thing)
 class CustomFile():
    """
    Small record pairing a file name with its ignore-list status.
    Converting it to a string yields an aligned, printable line.
    """
    def __init__(self, file_name: str, is_on_ignore_list: bool) -> None:
        self.is_on_ignore_list = is_on_ignore_list
        self.file_name = file_name

    def __str__(self) -> str:
        # Both prefixes have the same width so that the printed file names line up.
        prefix = '(ignore) ' if self.is_on_ignore_list else '         '
        return f'{prefix}{self.file_name}'
 def get_file_mime(fn: str) -> str:
    """
    Map a file name to the mime type of its language, judged by the extension only.

    Raises UnsupportedFileType when the extension is not one of the known ones.
    """
    # Checked in order; each entry maps a group of extensions to a key of MIME.
    extension_groups = (
        (('.py',), 'python'),
        (('.cpp', '.hpp'), 'cpp'),
        (('.c', '.h', '.ld'), 'c'),
    )
    for extensions, language in extension_groups:
        if fn.endswith(extensions):
            return MIME[language]
    raise UnsupportedFileType(fn)
 def get_comments(code: str, mime: str) -> list:
    """
    Parse all comments out of the source code and flatten them so that
    every line of a multiline comment becomes a Comment of its own.
    """
    flattened = []
    for parsed in comment_parser.extract_comments_from_str(code, mime):
        if not parsed.is_multiline():
            flattened.append(parsed)
            continue
        # One Comment per line, numbered consecutively from the line where the comment starts.
        first_line = parsed.line_number()
        flattened.extend(
            Comment(text, first_line + offset, True)
            for offset, text in enumerate(parsed.text().splitlines())
        )
    return flattened
 def has_valid_copyright(file_name: str, mime: str, is_on_ignore: bool, args: argparse.Namespace) -> Tuple[bool, bool]:
    """
    Detects if a file has a valid SPDX copyright notice.

    Besides checking, this function may rewrite the file on disk:
    - with args.replace set, a detected old Espressif header is replaced by the SPDX header
    - Espressif SPDX notices found within the first args.max_lines lines get their years reformatted
    - a file which is not on the ignore list and has no "copyright" comment gets a new header inserted

    file_name: path of the file to check
    mime: mime type of the file as returned by get_file_mime()
    is_on_ignore: True if the file is on the ignore list (no header is inserted then)
    args: parsed command line arguments (replace, verbose, debug, max_lines, ... are read)

    returns: Tuple[valid, modified]
    """
    detected_licenses = []
    detected_notices = []
    valid, modified = False, False
    # Read (and later write) explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale, which breaks on UTF-8 sources e.g. on Windows.
    with open(file_name, 'r', encoding='utf-8') as f:
        code = f.read()
    comments = get_comments(code, mime)
    code_lines = code.splitlines()
    if not code_lines:  # file is empty
        print(f'{TERMINAL_YELLOW}"{file_name}" is empty!{TERMINAL_RESET}')
        valid = True
        return valid, modified
    if args.replace:
        try:
            year, line = detect_old_header_style(file_name, comments, args)
        except NotFound as e:
            if args.verbose:
                print(f'{TERMINAL_GRAY}{e} in {file_name}{TERMINAL_RESET}')
        else:
            code_lines = replace_copyright(code_lines, year, line, mime, file_name)
            valid = True
    for comment in comments:
        # only the beginning of the file is searched for notices
        if comment.line_number() > args.max_lines:
            break
        matches = re.search(r'SPDX-FileCopyrightText: ?(.*)', comment.text(), re.IGNORECASE)
        if matches:
            detected_notices.append((matches.group(1), comment.line_number()))
            try:
                year = extract_year_from_espressif_notice(matches.group(1))
            except NotFound as e:
                if args.verbose:
                    print(f'{TERMINAL_GRAY}Not an {e.thing} {file_name}:{comment.line_number()}{TERMINAL_RESET}')
            else:
                # Espressif notice: rewrite the whole line with the up-to-date year range,
                # using the comment style which matches the file type.
                template = NOTICE
                if comment.is_multiline():
                    template = NOTICE_MULTILINE
                if mime == MIME['python']:
                    template = PYTHON_NOTICE
                code_lines[comment.line_number() - 1] = template.format(years=format_years(year, file_name))
        matches = re.search(r'SPDX-License-Identifier: ?(.*)', comment.text(), re.IGNORECASE)
        if matches:
            detected_licenses.append((matches.group(1), comment.line_number()))
    if not is_on_ignore and not contains_any_copyright(comments, args):
        code_lines = insert_copyright(code_lines, file_name, mime)
        print(f'"{file_name}": inserted copyright notice - please check the content and run commit again!')
        valid = True
    new_code = '\n'.join(code_lines) + '\n'
    # write the file only if something changed, so untouched files are not reported as modified
    if code != new_code:
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(new_code)
        modified = True
    # a file with both an SPDX notice and an SPDX license identifier is considered valid
    if detected_licenses and detected_notices:
        if args.debug:
            print(f'{file_name} notices: {detected_notices}')
            print(f'{file_name} licenses: {detected_licenses}')
        valid = True
    return valid, modified
 def contains_any_copyright(comments: list, args: argparse.Namespace) -> bool:
    """
    Tell whether the word "copyright" (in any letter case) occurs in a comment
    located on one of the first args.max_lines lines.
    """
    for candidate in comments:
        if candidate.line_number() > args.max_lines:
            continue
        if re.search(r'copyright', candidate.text(), re.IGNORECASE):
            return True
    return False
 def insert_copyright(code_lines: list, file_name: str, mime: str) -> list:
    """
    Insert a copyright notice at the beginning of a file, respecting a potential shebang.

    code_lines: lines of the file (without line endings); the list is not modified
    file_name: name of the file, only used in error messages of format_years()
    mime: mime type of the file; selects the Python or the block-comment header

    returns: new list of lines with the header for the current year inserted
    """
    template = NEW_APACHE_HEADER
    if mime == MIME['python']:
        template = NEW_APACHE_HEADER_PYTHON
    # a year of 0 makes format_years() use the current year
    header_lines = template.format(years=format_years(0, file_name)).splitlines()
    # if first line contains a shebang, keep it first
    # (the emptiness check avoids an IndexError on an empty list of lines)
    if code_lines and code_lines[0].startswith('#!'):
        return [code_lines[0], *header_lines, *code_lines[1:]]
    return [*header_lines, *code_lines]
 def extract_year_from_espressif_notice(notice: str) -> int:
    """
    Return the first (creation) year of an Espressif copyright notice.

    Raises NotFound when the text is not an Espressif notice.
    """
    # A notice carries either a single year or a range; only the first year is captured.
    found = re.search(r'(\d{4})(?:-\d{4})? Espressif Systems', notice, re.IGNORECASE)
    if found is None:
        raise NotFound('Espressif copyright notice')
    return int(found.group(1))
 def replace_copyright(code_lines: list, year: int, line: int, mime: str, file_name: str) -> list:
    """
    Swap the old header style for the new SPDX form.

    The lines are replaced in place in code_lines, which is also returned.
    """
    # The removed range starts at line number (line) and spans as many lines as the OLD HEADER has;
    # the new header, chosen by file type, is then inserted at the same position.
    first = line - 1
    end = line + OLD_APACHE_HEADER.count('\n')
    del code_lines[first:end - 1]
    header_template = NEW_APACHE_HEADER_PYTHON if mime == MIME['python'] else NEW_APACHE_HEADER
    years = format_years(year, file_name)
    code_lines[first:first] = header_template.format(years=years).splitlines()
    print(f'{TERMINAL_GRAY}"{file_name}": replacing old header (lines: {line}-{end}) with new SPDX header style.{TERMINAL_RESET}')
    return code_lines
 def detect_old_header_style(file_name: str, comments: list, args: argparse.Namespace) -> Tuple[int, int]:
    """
    Look for the old (Apache-2.0) header style and extract the year and line number of its notice.
    Raises NotFound when the header or an Espressif notice in it cannot be found.
    returns: Tuple[year, comment line number]
    """
    # only comments up to line number MAX_LINES are considered
    considered = []
    for comment in comments:
        if comment.line_number() > args.max_lines:
            break
        considered.append(comment)
    comments_text = ''.join(f'\n{comment.text().strip()}' for comment in considered)
    ratio = fuzz.partial_ratio(comments_text, OLD_APACHE_HEADER)
    if args.debug:
        print(f'{TERMINAL_GRAY}ratio for {file_name}: {ratio}{TERMINAL_RESET}')
    if not ratio > args.fuzzy_ratio:
        raise NotFound('Old Espressif header')
    # the comments are similar enough to the old header; find the line with the Espressif notice
    for comment in considered:
        try:
            year = extract_year_from_espressif_notice(comment.text())
        except NotFound:
            continue
        return (year, comment.line_number())
    raise NotFound('Old Espressif header')
 def format_years(past: int, file_name: str) -> str:
    """
    Build the year part of a copyright notice:
     - 0 or the current year -> output: [current year]
     - some year in the past -> output: [past year]-[current year]
    Raises ValueError for a year in the future or before 1972.
    """
    today = datetime.datetime.now().year
    # 0 stands for "no year known", in which case the current year is used
    first_year = today if past == 0 else past
    if first_year == today:
        return str(first_year)
    if not 1972 <= first_year <= today:
        raise ValueError(
            f'{file_name}: invalid year in the copyright header detected. '
            'Check your system clock and the copyright header.'
        )
    return f'{first_year}-{today}'
 def check_copyrights(args: argparse.Namespace) -> Tuple[List, List]:
    """
    Main logic and for loop

    Checks every file in args.filenames. Files on the ignore list which turn out
    to be valid are removed from the ignore list file, unless
    args.dont_update_ignore_list is set.

    returns:
        list of files with wrong headers
        list of files which were modified
    """
    wrong_header_files = []
    modified_files = []
    with open(IGNORE_LIST_FN, 'r', encoding='utf-8') as f:
        ignore_list = [item.strip() for item in f]
    updated_ignore_list = ignore_list.copy()
    # set for constant-time membership tests; the list is kept to preserve the file order
    ignored = set(ignore_list)
    for file_name in args.filenames:
        try:
            mime = get_file_mime(file_name)
        except UnsupportedFileType:
            print(f'{TERMINAL_GRAY}"{file_name}" is not of a supported type! Skipping.{TERMINAL_RESET}')
            continue
        is_on_ignore = file_name in ignored
        if is_on_ignore and args.verbose:
            print(f'{TERMINAL_GRAY}"{file_name}" is on the ignore list.{TERMINAL_RESET}')
        valid, modified = has_valid_copyright(file_name, mime, is_on_ignore, args)
        if modified:
            modified_files.append(CustomFile(file_name, is_on_ignore))
        if not valid:
            wrong_header_files.append(CustomFile(file_name, is_on_ignore))
        elif is_on_ignore:
            if args.dont_update_ignore_list:
                print(f'{TERMINAL_YELLOW}"{file_name}" now has a correct copyright header - remove it from the ignore list '
                      f'or run this script without the --dont-update-ignore-list option to do this automatically!{TERMINAL_RESET}')
            else:
                try:
                    updated_ignore_list.remove(file_name)
                except ValueError:
                    # already removed: the same file was given more than once on the command line
                    pass
    if updated_ignore_list != ignore_list:
        with open(IGNORE_LIST_FN, 'w', encoding='utf-8') as f:
            f.writelines(f'{item}\n' for item in updated_ignore_list)
        # the ignore list itself counts as a modified file, so the commit gets aborted
        modified_files.append(CustomFile(IGNORE_LIST_FN, False))
        print(f'\n{TERMINAL_GREEN}Files removed from ignore list:{TERMINAL_RESET}')
        remaining = set(updated_ignore_list)
        for file in ignore_list:
            if file not in remaining:
                print(f'    {file}')
    return wrong_header_files, modified_files
 def build_parser() -> argparse.ArgumentParser:
    """
    Create the command line parser of this script.

    returns: parser accepting the options below and at least one file name
    """
    parser = argparse.ArgumentParser(description='Check copyright headers')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='print more information (useful for debugging)')
    parser.add_argument('-r', '--replace', action='store_true',
                        help='tries to update copyright notices')
    parser.add_argument('-m', '--max-lines', type=int, default=30,
                        help='how far to check for copyright notice in a file (default 30)')
    parser.add_argument('-f', '--fuzzy-ratio', type=int, default=95,
                        help='minimum %% ratio to be considered as equal to the old header style (default 95)')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='print debug info')
    parser.add_argument('-du', '--dont-update-ignore-list', action='store_true',
                        help='do not remove files with a correct copyright header from the ignore list, '
                             'only report them')
    parser.add_argument('filenames', nargs='+', help='file(s) to check', metavar='file')
    return parser
 def main() -> None:
    """
    Entry point: check the files given on the command line and report the result.

    Exits with status 1 (which aborts the commit) if any file was modified or
    if a file which is not on the ignore list failed the copyright check.
    """
    args = build_parser().parse_args()
    if args.debug:
        print(f'{TERMINAL_GRAY}Running with args: {args}')
        print(f'Ignore list: {IGNORE_LIST_FN}{TERMINAL_RESET}')
    wrong_header_files, modified_files = check_copyrights(args)
    if modified_files:
        print(f'\n{TERMINAL_YELLOW}Modified files:{TERMINAL_RESET}')
        for file in modified_files:
            print(file)
        print(CHECK_MODIFY_MESSAGE)
    abort_commit = bool(modified_files)
    if wrong_header_files:
        print(f'{TERMINAL_YELLOW}Information about this test{TERMINAL_RESET}')
        print(CHECK_FAIL_MESSAGE.format(example=NEW_APACHE_HEADER.format(years=datetime.datetime.now().year)))
        print(f'{TERMINAL_RED}Files which failed the copyright check:{TERMINAL_RESET}')
        for wrong_file in wrong_header_files:
            # files on the ignore list are reported, but do not fail the check on their own
            if not wrong_file.is_on_ignore_list:
                abort_commit = True
            print(wrong_file)
    num_files_processed = len(args.filenames)
    plural = '' if num_files_processed == 1 else 's'
    if abort_commit:
        num_files_modified = len(modified_files)
        num_files_wrong = len(wrong_header_files)
        print(f'{TERMINAL_YELLOW}Processed {num_files_processed} source file{plural},', end=' ')
        print(f'{num_files_modified} modified and {num_files_wrong} with invalid copyright.{TERMINAL_RESET}')
        sys.exit(1)  # sys.exit(1) to abort the commit
    # pre-commit also automatically aborts a commit if files are modified on disk
    print(f'\n{TERMINAL_GREEN}Successfully processed {num_files_processed} file{plural}.{TERMINAL_RESET}\n')
 if __name__ == '__main__':
    main()
--- a/tools/ci/check_copyright_ignore.txt
+++ b/tools/ci/check_copyright_ignore.txt
--- a/tools/ci/executable-list.txt
+++ b/tools/ci/executable-list.txt
@ -39,6 +39,7 @@ tools/ci/check_api_violation.sh
 tools/ci/check_build_warnings.py
 tools/ci/check_callgraph.py
 tools/ci/check_codeowners.py
 tools/ci/check_copyright.py
 tools/ci/check_deprecated_kconfigs.py
 tools/ci/check_examples_cmake_make.py
 tools/ci/check_examples_rom_header.sh