#!/usr/bin/env bash
# SPDX-License-Identifier: Apache-2.0
# Copyright (C) 2021 Arm Limited or its affiliates and Contributors. All rights reserved.

# Debug aid: a non-empty TRACE turns on command tracing for the whole script.
if [[ -n "${TRACE-}" ]]; then
    set -x
fi
# Strict mode: abort on errors, on unset variables and on failing pipelines.
set -o errexit -o nounset -o pipefail

# Print the usage text to stdout, one argument per output line.
print_help() {
    printf '%s\n' \
        'Usage: manyclangs-build-month YYYY MM' \
        '' \
        'manyclangs-build-month is a "set it and forget it" script that builds' \
        'the specified month and produces a .pack file.'
}

# List the commits of one month by delegating to git-list-between.
#
# git-list-between is called with origin/main, the first day of the requested
# month and the first day of the following month (the latter is meant as an
# exclusive bound), restricted to the directories relevant to the build.
#
# Arguments: $1 - year
#            $2 - month; a zero-padded value such as "08" is accepted
# Outputs:   whatever git-list-between prints on stdout
commits_for_month() {
    local year="$1"
    local month="$2"

    # 10# forces base 10: a zero-padded "08" or "09" would otherwise be
    # rejected as an invalid octal number.
    local next_year=$(( year ))
    local next_month=$(( 10#$month + 1 ))
    if (( next_month == 13 )); then
        # December rolls over into January of the next year.
        next_year=$(( next_year + 1 ))
        next_month=1
    fi

    local from_incl="$year/$month/01"
    local to_excl="$next_year/$next_month/01"

    # Consider only commits touching these directories of interest.
    local -a interesting_paths=(
        clang
        clang-tools-extra
        cmake
        llvm
        third-party
    )
    git-list-between origin/main "${from_incl}" "${to_excl}" -- "${interesting_paths[@]}"
}

# Emit, NUL-separated and relative to the current directory, the files that
# belong in a snapshot.
_manyclangs_list_files0_to_pack() {
    # Every object file and header, at any depth.
    find . -type f \( -name '*.o' -o -name '*.h' \) -printf '%P\0'

    # Plus a fixed set of files that are only picked up when they sit directly
    # in the current directory: the link script, README, ninja files, licenses,
    # logs and the commit marker.
    local -a top_level_names=(
        link.sh
        README
        build.ninja
        rules.ninja
        'LICENSE*'
        '*.log'
        'COMMIT_SHA'
    )
    local -a name_tests=()
    local pattern
    for pattern in "${top_level_names[@]}"; do
        name_tests+=(-o -name "$pattern")
    done
    # Drop the leading -o so the tests form a plain "a -o b -o c" expression.
    find . -maxdepth 1 -type f \( "${name_tests[@]:1}" \) -printf '%P\0'
}

# Build every commit listed for one part of the month and store each result as
# an elfshaker snapshot.
#
# Arguments: $1 - year
#            $2 - month
#            $3 - part suffix; selects manyclangs-commit-list.txt.<part> in the
#                 current directory, one "<commit sha> <snapshot name>" per line
# Environment (read, with defaults applied below):
#            ELFSHAKER_DATA, CCACHE_DIR, CCACHE_MAXSIZE
#            MANYCLANGS_NODELETE - when non-empty the build directory is kept
# Side effects:
#            - stdout and stderr of the calling shell are redirected to
#              mc-build-<year><month>-part<part>.log for good; fd 30 keeps the
#              previous stdout and receives one progress line per commit.
#            - exports CCACHE_DIR, CCACHE_MAXSIZE, CCACHE_BASEDIR, GIT_WORK_TREE
#              and GIT_INDEX_FILE, and sets several plain globals.
# NOTE(review): because of the exec redirections, a second call in the same
# shell would duplicate the first call's log file onto fd 30.
_manyclangs_build_part() {
    YEAR=$1
    MONTH=$2
    PART=$3
    BUILDNAME=mc-build-"$YEAR""$MONTH"-part"$PART"
    BUILD_DIR="$PWD/${BUILDNAME}"

    COMMIT_LIST="$PWD/manyclangs-commit-list.txt.$PART"

    # Honour values supplied by the caller; otherwise keep everything below
    # the current directory.
    ELFSHAKER_DATA=${ELFSHAKER_DATA-$PWD/elfshaker_data}
    mkdir -p "$ELFSHAKER_DATA"
    export CCACHE_DIR=${CCACHE_DIR-$PWD/ccache}
    # Assumes plenty of space available, bonus of having whole cache around to
    # play with, typical month ~= 40GiB cache.
    export CCACHE_MAXSIZE=${CCACHE_MAXSIZE-60G}

    # FD chosen to avoid collision with jobserver.
    exec 30>&1 # dup 30 to 1, writes to 30 now go to old stdout.
    exec &> "${BUILDNAME}".log # redirect everything to log.

    echo Building part "$BUILDNAME"

    # Defines where git will write files.
    # Turns out we don't need to do anything other than this.
    # Git restore will ensure that 'no-overlay' is in effect and deleted
    # files will go away.
    # NOTE(review): GIT_INDEX_FILE is a relative path here; confirm which
    # directory git resolves it against.
    export GIT_WORK_TREE="$PWD/${BUILDNAME}" GIT_INDEX_FILE="${BUILDNAME}.gitidx"
    HASHES_DIR=$PWD/hashes
    mkdir -p "$HASHES_DIR"
    mkdir -p "${GIT_WORK_TREE}/build"
    pushd "${GIT_WORK_TREE}/build"
    # Make the shared elfshaker data reachable as ./elfshaker_data from inside
    # the build directory (forced, relative symlink).
    ln -frs "${ELFSHAKER_DATA}" elfshaker_data

    # CCACHE_BASEDIR causes ccache to rewrite arguments to the compiler to be
    # relative to $CCACHE_BASEDIR. This is significant because binaries often
    # end up with paths as specified on the commandline through the macro
    # __PRETTY_FUNCTION__ (see elfshaker/elfshaker#11).
    export CCACHE_BASEDIR="$GIT_WORK_TREE"

    # The first word of each line is the commit, the remainder of the line is
    # the snapshot name.
    while read -r COMMIT_SHA SNAPSHOT_NAME
    do
        # A loose pack index with this name means the snapshot was already
        # stored, so the commit is not built again.
        if [ -e "elfshaker_data/packs/loose/${SNAPSHOT_NAME}.pack.idx" ]
        then
            echo "manyclangs-build-month: Skip already built commit $SNAPSHOT_NAME"
            echo "Skipped $SNAPSHOT_NAME @ $(date "+%Y/%m/%d %H:%M:%S")" 1>&30 # for progress output.
            continue
        fi
        echo "manyclangs-build-month: Building commit $SNAPSHOT_NAME"

        EXTRA=()
        # Detect the cmake top-level directory introduced Jan 2022.
        if git ls-tree "$COMMIT_SHA":cmake &> /dev/null;
        then
            EXTRA+=(cmake)
        fi

        # Handy command which sets the sources in ./llvm ./clang to the right
        # contents efficiently (including removing files which have gone). It
        # makes use of the $GIT_INDEX_FILE to keep track of the old state so is
        # efficient.
        # The ':^' pathspecs exclude the test directories of llvm, clang and
        # clang-tools-extra from the restore.
        (cd .. &&
         git restore --no-progress --staged --worktree --source="$COMMIT_SHA" -- \
             {llvm,clang,clang-tools-extra,third-party} :^{llvm,clang,clang-tools-extra}/test "${EXTRA[@]}"
        )

        # Record which commit this tree belongs to; the file is one of those
        # picked up by _manyclangs_list_files0_to_pack.
        echo "$COMMIT_SHA" > COMMIT_SHA
        date

        # manyclangs-build takes care of removing object files not produced by
        # the build system. Consume non-zero exit status if present and
        # continue.
        if manyclangs-build "$COMMIT_SHA"
        then
            # Build success, capture object hashes and store to elfshaker.
            # The hash list is written as "<sha1> <path>" sorted by path.
            _manyclangs_list_files0_to_pack | xargs -0 -P"$(nproc)" sha1sum | awk '{print $1" "$2}' | sort -k2,2 > "${HASHES_DIR}"/"${SNAPSHOT_NAME}".txt
            _manyclangs_list_files0_to_pack | elfshaker store --verbose --files0-from=- "$SNAPSHOT_NAME"
        else
            # On failure, capture the build log together with an empty
            # BUILD_FAILED marker file, then remove the marker again.
            touch BUILD_FAILED
            { echo BUILD_FAILED; echo build.log; } | elfshaker store --verbose --files-from=- "$SNAPSHOT_NAME"
            rm BUILD_FAILED
        fi

        echo "Built $SNAPSHOT_NAME @ $(date "+%Y/%m/%d %H:%M:%S")" 1>&30 # for progress output.
    done < "${COMMIT_LIST}"

    popd

    # Remove the checkout and build tree unless asked to keep it.
    if [[ -z "${MANYCLANGS_NODELETE-}" ]]
    then
        rm -r "${BUILD_DIR}"
    fi
}

# Build the given parts of a month one after another.
#
# This function is exported and also runs via 'bash -c', so it re-establishes
# tracing and strict mode itself.
#
# Arguments: $1 - year
#            $2 - month
#            $3... - part suffixes, each handed to _manyclangs_build_part
_manyclangs_build_parts() {
    [[ "${TRACE-}" ]] && set -x
    set -euo pipefail # exported function, fresh shell.
    local year=$1
    local month=$2
    shift 2

    local part
    for part in "$@"
    do
        # _manyclangs_build_part redirects the shell's stdout/stderr to its
        # log file and duplicates the then-current stdout onto fd 30. Running
        # each part in its own subshell keeps those redirections (and the
        # exported environment) from leaking into the next part, whose
        # progress output would otherwise land in the previous part's log.
        # Under 'set -e' a failing subshell still aborts the loop.
        ( _manyclangs_build_part "$year" "$month" "$part" )
    done
}

# Verify that the external commands this script invokes are available and that
# git can locate a repository. On failure a diagnostic is written to stderr and
# the shell exits with status 1.
#
# Globals: BUILDER_CLANG_VERSION_SUFFIX (read) - appended to "clang" and
#          "clang++" to form the compiler names that are looked up.
sanity_check() {
    local -a deps=(
        jobserver jq compdb2line ninja cmake
        "clang${BUILDER_CLANG_VERSION_SUFFIX}"
        "clang++${BUILDER_CLANG_VERSION_SUFFIX}"
        elfshaker git-list-between manyclangs-build
    )
    # elfshaker, git-list-between and manyclangs-build are called directly by
    # this script. Checking them up front matters most for manyclangs-build:
    # when it cannot be found, every commit would be recorded as a failed
    # build instead of the run stopping.
    local -a missing=()
    local dep
    for dep in "${deps[@]}"; do
        if ! command -v "$dep" &> /dev/null; then
            missing+=("$dep")
        fi
    done

    if [ ${#missing[@]} -ne 0 ]; then
        {
            echo "Missing some binaries from \$PATH: ${missing[*]}"
            echo "Note that compdb2line is part of this project and needs installing."
            echo "jobserver can be found as part of https://github.com/olsner/jobclient"
        } >&2
        exit 1
    fi

    if ! git rev-parse --git-dir &> /dev/null; then
        echo 'Run with GIT_DIR set to path/to/llvm-project/.git or from inside a repository.' >&2
        exit 1
    fi
}

# Consume stdin through pv in line mode so it can draw a progress display
# sized to the expected number of lines; pv's stdout is discarded.
#
# Arguments: $1 - expected total number of lines (one per commit)
progress_bar() {
    COMMIT_COUNT=$1
    local -a pv_options=(
        --bytes
        --progress
        --fineta
        --timer
        --average-rate
        --line-mode
        --size "$COMMIT_COUNT"
    )
    pv "${pv_options[@]}" > /dev/null
}

# Filter stdin to stdout: when MANYCLANGS_LIMIT is set and non-empty only that
# many leading lines are kept, otherwise the input passes through unchanged.
apply_limit() {
    if [[ -n "${MANYCLANGS_LIMIT-}" ]]; then
        head -n "$MANYCLANGS_LIMIT"
    else
        cat
    fi
}

# Filter stdin to stdout: when MANYCLANGS_SUBSET is set and non-empty only
# every Nth line (N = MANYCLANGS_SUBSET) is kept, otherwise the input passes
# through unchanged.
apply_subset() {
    if [ -z "${MANYCLANGS_SUBSET-}" ]; then
        cat
    else
        # Hand the value to awk as a variable rather than splicing it into the
        # program text, so it can only ever be treated as data.
        awk -v stride="$MANYCLANGS_SUBSET" 'NR % stride == 0 { print }'
    fi
}

# Build all commits of one month and combine the snapshots into a single pack.
#
# Arguments: $1 - year (YYYY)
#            $2 - month (MM)
# Environment (all optional, defaults applied below):
#            BUILDER_CLANG_VERSION_SUFFIX, MANYCLANGS_PACK_TYPE,
#            MANYCLANGS_NPARALLEL, MANYCLANGS_NJOB, MANYCLANGS_NPARTS;
#            MANYCLANGS_LIMIT and MANYCLANGS_SUBSET thin out the commit list.
# Files:     writes manyclangs-commit-list.txt* in the current directory and
#            reads the snapshots below elfshaker_data/packs/loose.
# Exits with status 1 after printing the usage text when the arguments are
# missing or do not form a valid date.
main() {
    BUILDER_CLANG_VERSION_SUFFIX=${BUILDER_CLANG_VERSION_SUFFIX--12}

    sanity_check

    echo

    # Without this check a missing month trips 'set -u' and the script dies
    # with an "unbound variable" error instead of showing how to call it.
    if [ $# -lt 2 ]; then
        print_help
        exit 1
    fi

    YEAR="$1"
    MONTH="$2"
    MANYCLANGS_PACK_NAME="$(uname -m)-${MANYCLANGS_PACK_TYPE-ubuntu2004}-manyclangs-$YEAR$MONTH"

    # Reject anything date(1) cannot parse as the first day of a month.
    date -d "$YEAR/$MONTH/01" 1>/dev/null || { print_help; exit 1; }

    # Collect the month's commits, then optionally thin the list out.
    commits_for_month "$YEAR" "$MONTH"  > manyclangs-commit-list.txt
    apply_limit < manyclangs-commit-list.txt | apply_subset > manyclangs-commit-list.txt.subset
    mv manyclangs-commit-list.txt.subset manyclangs-commit-list.txt
    COMMIT_COUNT=$(wc -l manyclangs-commit-list.txt | awk '{print $1}')

    NPARALLEL=${MANYCLANGS_NPARALLEL-5} # Number of parts to run in parallel.
    # A few extra slots, since we lose at least one to the build system and we
    # don't expect every job to take 100% CPU.
    JOBSLOP=5
    NJOBS=${MANYCLANGS_NJOB-$(($(nproc) + JOBSLOP))}

    # The progress bar is only shown when stdin is a terminal.
    INTERACTIVE=1
    if [ -t 0 ]; then
        echo
        echo "manyclangs-build-month: stdin attached to terminal, running in interactive mode with progress bar."
        echo
    else
        echo
        echo "manyclangs-build-month: stdin not attached to terminal, running in batch mode."
        echo
        INTERACTIVE=
    fi

    if [[ "$NPARALLEL" != 1 ]] && command -v ninja-jobclient &> /dev/null
    then
        NPARTS=${MANYCLANGS_NPARTS-20} # Divide work into small equal-ish-sized parts.
        # Produces manyclangs-commit-list.txt.00, .01, ... without splitting
        # lines.
        split --number "l/$NPARTS" --numeric-suffixes manyclangs-commit-list.txt{,.}

        echo "manyclangs-build-month $YEAR $MONTH"
        echo "  Building $YEAR $MONTH, $COMMIT_COUNT commits in $NPARTS parts"
        echo "  ... with $NPARALLEL parallel builds and $NJOBS parallel processes job slots."
        echo
        echo "  Time now is $(date "+%Y/%m/%d %H:%M:%S"), for a full month's "
        echo "  worth of commits (~2000) this should be done within 6 hours "
        echo "  on a 64 core machine, so ETA is by around $(date -d 'now +6 hours' "+%Y/%m/%d %H:%M")"
        echo
        echo "  Pack name: ${MANYCLANGS_PACK_NAME}"
        echo
        echo "  To inspect live build logs see, tail the files called $PWD/mc-build-$YEAR$MONTH-"'*'".log"
        echo
        echo "Some configurable variables, and their defaults:"
        # Self-documenting: lists the defaulted MANYCLANGS_ expansions found in
        # this very script.
        grep -E -o '\$\{MANYCLANGS_[^-]+-[^\}]+\}' "$0"
        echo

        # xargs starts one fresh bash per part number, so the functions have
        # to be exported to be visible there.
        export -f _manyclangs_build_parts _manyclangs_build_part _manyclangs_list_files0_to_pack
        command time -v \
            jobserver -j"$NJOBS" \
            xargs --max-procs="$NPARALLEL" -n1 \
            bash -c '_manyclangs_build_parts "$@"' -- "$YEAR" "$MONTH" \
                <<<"$(seq -f '%02.0f' 00 $(( NPARTS - 1 )) )" \
                | { if [ "${INTERACTIVE}" ]; then progress_bar "$COMMIT_COUNT"; else cat; fi; } \
                ;
    else
        if ! command -v ninja-jobclient &> /dev/null; then
            echo '
NOTE: ninja-jobclient is unavailable; falling back to serial builds, which will
make less good use of compute resource owing to builds being serialized by high
compilation time of few compilation units.

To obtain this, build a fork of ninja with this pull request applied, and put
the binary in your path as ninja-jobclient.

https://github.com/ninja-build/ninja/pull/1140

Sleeping 5 seconds, CTRL-C to abort, or wait to continue.
'
            sleep 5
        fi
        # Build everything in one part.
        mv manyclangs-commit-list.txt{,.00}
        # (not time -v to avoid extra processes)
        time _manyclangs_build_parts "$YEAR" "$MONTH" 00 \
            | { if [ "${INTERACTIVE}" ]; then progress_bar "$COMMIT_COUNT"; else cat; fi; }
    fi

    # Globs are already sorted.
    # Turn the matched paths into snapshot names of the form loose/<name> by
    # stripping the index suffix and the pack directory prefix.
    SNAPSHOTS_SORTED=(elfshaker_data/packs/loose/${YEAR}${MONTH}*)
    SNAPSHOTS_SORTED=("${SNAPSHOTS_SORTED[@]%.pack.idx}")
    SNAPSHOTS_SORTED=("${SNAPSHOTS_SORTED[@]#elfshaker_data/packs/}")

    time elfshaker pack "$MANYCLANGS_PACK_NAME" "${SNAPSHOTS_SORTED[@]}"
    echo
    echo
    echo "manyclangs-build-month $YEAR $MONTH all done, here are the resulting files:"
    echo
    du -sch elfshaker_data/packs/"${MANYCLANGS_PACK_NAME}".pack{,.idx}
    echo
    echo "(A typical 1 month upstream pack takes 100MiB storage)"
    echo
}

# Entry dispatch. No argument, an empty first argument, -h or --help print the
# usage text; everything else is passed on to main. Asking for help explicitly
# exits 0, while running the script without any argument exits 1.
if [[ "${1:--h}" == "-h" || "${1:--h}" == "--help" ]]; then
    print_help
    if [[ $# -eq 0 ]]; then
        exit 1
    fi
    exit 0
else
    main "$@"
fi
