From c0a4b41afeb066b0d5eb6c0adbf6be6244966a88 Mon Sep 17 00:00:00 2001
From: Profpatsch
Date: Thu, 24 Dec 2020 03:01:10 +0100
Subject: [PATCH] tree-sitter: improve update script to fetch all available grammars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, the update script would only fetch the few grammars
listed in the tree-sitter repository. But the tree-sitter GitHub
organization hosts a rather large number of officially supported
grammars.

Thus we change the script to query the GitHub API for the
organization's repositories instead (up to 100 repositories can be
listed without paging).

Since the repository list also contains some repositories that are
not grammars, there is a bash script which lists all repos we are
aware of and the ones we want to ignore. By comparing against the
actual list with jq, it makes sure we don’t forget any repositories
in the future.

If the repository list cannot be fetched or parsed, the update aborts
and prints the offending response on stderr.
---
 .../tools/parsing/tree-sitter/update.nix      | 94 +++++++++++++++++--
 1 file changed, 86 insertions(+), 8 deletions(-)

diff --git a/pkgs/development/tools/parsing/tree-sitter/update.nix b/pkgs/development/tools/parsing/tree-sitter/update.nix
index 2a3575a44d2..430d2ead026 100644
--- a/pkgs/development/tools/parsing/tree-sitter/update.nix
+++ b/pkgs/development/tools/parsing/tree-sitter/update.nix
@@ -3,11 +3,70 @@
 , src }:
 
 let
-  # print all the grammar names mentioned in the fetch-fixtures script
-  getGrammarNames = writeShellScript "get-grammars.sh" ''
+  # check that every repo in the json list on stdin is either known or ignored; print the known grammars
+  checkKnownGrammars = writeShellScript "check-known-grammars.sh" ''
     set -euo pipefail
-    sed -ne 's/^fetch_grammar \(\S*\).*$/\1/p' \
-      ${src}/script/fetch-fixtures
+    known='
+      [ "tree-sitter-javascript"
+      , "tree-sitter-c"
+      , "tree-sitter-swift"
+      , "tree-sitter-json"
+      , "tree-sitter-cpp"
+      , "tree-sitter-ruby"
+      , "tree-sitter-razor"
+      , "tree-sitter-go"
+      , "tree-sitter-c-sharp"
+      , "tree-sitter-python"
+      , "tree-sitter-typescript"
+      , "tree-sitter-rust"
+      , "tree-sitter-bash"
+      , "tree-sitter-php"
+      , "tree-sitter-java"
+      , "tree-sitter-scala"
+      , "tree-sitter-ocaml"
+      , "tree-sitter-julia"
+      , "tree-sitter-agda"
+      , "tree-sitter-fluent"
+      , "tree-sitter-html"
+      , "tree-sitter-haskell"
+      , "tree-sitter-regex"
+      , "tree-sitter-css"
+      , "tree-sitter-verilog"
+      , "tree-sitter-jsdoc"
+      , "tree-sitter-ql"
+      ]'
+    ignore='
+      [ "tree-sitter"
+      , "tree-sitter-cli"
+      , "tree-sitter-embedded-template"
+      ${/*this is the haskell language bindings, tree-sitter-haskell is the grammar*/""}
+      , "haskell-tree-sitter"
+      ${/*this is the ruby language bindings, tree-sitter-ruby is the grammar*/""}
+      , "ruby-tree-sitter"
+      ${/*this is the (unmaintained) rust language bindings, tree-sitter-rust is the grammar*/""}
+      , "rust-tree-sitter"
+      ${/*this is the nodejs language bindings, tree-sitter-javascript is the grammar*/""}
+      , "node-tree-sitter"
+      ${/*this is the python language bindings, tree-sitter-python is the grammar*/""}
+      , "py-tree-sitter"
+      ${/*afl fuzzing for tree sitter*/""}
+      , "afl-tree-sitter"
+      ${/*archived*/""}
+      , "highlight-schema"
+      ${/*website*/""}
+      , "tree-sitter.github.io"
+      ]'
+    res=$(${jq}/bin/jq \
+      --argjson known "$known" \
+      --argjson ignore "$ignore" \
+      '. - ($known + $ignore)' \
+      )
+    if [ "$res" != "[]" ]; then
+      echo "These repositories are neither known nor ignored:" 1>&2
+      echo "$res" 1>&2
+      exit 1
+    fi
+    printf '%s' "$known"
   '';
 
   # TODO
@@ -22,7 +81,7 @@ let
     res=$(${curl}/bin/curl \
       --silent \
       "https://api.github.com/repos/${urlEscape owner}/$(${urlEscapeSh} "$repo")/releases/latest")
-    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message')" =~ "rate limit" ]]; then
+    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
       echo "rate limited" >&2
     fi
     release=$(printf "%s" "$res" | ${jq}/bin/jq '.tag_name')
@@ -34,6 +93,21 @@ let
     echo "$release"
   '';
 
+  # print the names of a github organization's repos as a json list (first 100 only, no paging)
+  latestGithubRepos = { orga }: writeShellScript "latest-github-repos" ''
+    set -euo pipefail
+    res=$(${curl}/bin/curl \
+      --silent \
+      'https://api.github.com/orgs/${urlEscape orga}/repos?per_page=100')
+
+    if [[ "$(printf "%s" "$res" | ${jq}/bin/jq '.message?')" =~ "rate limit" ]]; then
+      echo "rate limited" >&2
+    fi
+
+    printf "%s" "$res" | ${jq}/bin/jq 'map(.name)' \
+      || { echo "failed $res" >&2; exit 1; }
+  '';
+
   # update one tree-sitter grammar repo and print their nix-prefetch-git output
   updateGrammar = { owner }: writeShellScript "update-grammar.sh" ''
     set -euo pipefail
@@ -49,18 +123,22 @@ let
 
   update-all-grammars = writeShellScript "update-all-grammars.sh" ''
     set -euo pipefail
-    grammarNames=$(${getGrammarNames})
+    echo "fetching list of grammars" 1>&2
+    grammars=$(${latestGithubRepos { orga = "tree-sitter"; }})
+    echo "checking against the list of grammars we know" 1>&2
+    knownGrammars=$(printf '%s' "$grammars" | ${checkKnownGrammars})
+    # change the json list into an item-per-line bash format
+    grammarNames=$(printf '%s' "$knownGrammars" | ${jq}/bin/jq --raw-output '.[]')
     outputDir="${toString ./.}/grammars"
     mkdir -p "$outputDir"
     updateCommand=$(printf \
       '${updateGrammar { owner = "tree-sitter"; }} "$1" > "%s/$1.json"' \
       "$outputDir")
     printf '%s' "$grammarNames" \
-      | ${xe}/bin/xe printf "tree-sitter-%s\n" {} \
       | ${xe}/bin/xe -j2 -s "$updateCommand"
     ( echo "{"
       printf '%s' "$grammarNames" \
-        | ${xe}/bin/xe -s 'printf "  %s = (builtins.fromJSON (builtins.readFile ./tree-sitter-%s.json));\n" "$1" "$1"'
+        | ${xe}/bin/xe -s 'printf "  %s = (builtins.fromJSON (builtins.readFile ./%s.json));\n" "$1" "$1"'
       echo "}" ) \
       > "$outputDir/default.nix"
   '';