#!/usr/bin/env bash
###############################################################################
# fortran_code_stats.sh
#
# Purpose
# -------
# Analyse a Fortran source tree and compute structural and size statistics
# for each source file and directory.
# The script scans a directory recursively and reports metrics describing
# code size, program structure, and approximate control-flow complexity.
#
# File-level metrics
# ------------------
# For each source file the script computes:
#   - total lines
#   - code lines (internal metric)
#   - percentage of code lines in the file
#   - percentage contribution to the total code base
#   - number of programs
#   - number of modules
#   - number of subroutines
#   - number of functions
#   - number of branching constructs (complexity proxy)
#   - number of I/O operations
#   - average lines per subroutine
#   - average branching constructs per procedure
#
# Directory-level metrics
# -----------------------
# A second table aggregates statistics by directory, including:
#   - total code lines
#   - total lines
#   - number of subroutines
#   - average lines per subroutine
#   - percentage of total code base
#
# Global statistics
# -----------------
# The script also reports global properties of the analysed codebase:
#   - number of files analysed
#   - total number of lines
#   - minimum file size
#   - maximum file size
#   - average file size
#
# Code line definition
# --------------------
# A line is considered a "code line" if it is NOT:
#   - empty
#   - a full-line comment starting with "!"
#
# Inline comments are ignored when detecting structural constructs.
#
# Branch complexity proxy
# -----------------------
# Branch counts approximate control-flow complexity by detecting common
# Fortran branching constructs such as:
#   IF, ELSEIF, ELSE, SELECT CASE, CASE, DO, DO WHILE, WHERE, FORALL,
#   CYCLE, EXIT
#
# This is not a strict cyclomatic complexity metric but provides a
# lightweight indicator of structural complexity.
#
# Supported source files
# ----------------------
# The following common Fortran file extensions are scanned:
#   *.f90 *.F90 *.f *.F *.for *.f95 *.inc
#
# Usage
# -----
#   ./fortran_code_stats.sh
#
# Enable CSV export:
#   ./fortran_code_stats.sh --csv
#
# CSV outputs
# -----------
# When CSV export is enabled the script produces:
#   code_filestats.csv   : per-file statistics
#   code_dirstats.csv    : aggregated directory statistics
#
# These files can be analysed using spreadsheet software or scientific
# tools such as Python, R, or MATLAB.
#
# Typical applications
# --------------------
# This tool is useful for analysing scientific and HPC Fortran codebases:
#   - estimating code size and structure
#   - identifying large or complex source files
#   - comparing module sizes across a project
#   - locating potential refactoring targets
#   - obtaining quick metrics for large simulation codes
#
###############################################################################

# -----------------------------------------------------------------------------
# Options
# -----------------------------------------------------------------------------

# CSV export is off unless the first argument is exactly "--csv".
WRITE_CSV=false
case "${1:-}" in
    --csv) WRITE_CSV=true ;;
esac

# Output file names used when CSV export is enabled.
csv_file="code_filestats.csv"
directory_csv="code_dirstats.csv"

# -----------------------------------------------------------------------------
# Global counters
# -----------------------------------------------------------------------------

# Running totals over all analysed files.
total_all=0
total_code=0
file_count=0
sum_lines=0

# Extremes start empty, meaning "no file seen yet".
min_code=""
max_code=""
min_lines=""
max_lines=""

# One tab-separated record per analysed file.
declare -a files_stats

# Per-directory accumulators, keyed by directory path.
declare -A directory_code directory_all directory_subs \
           directory_lines_per_sub_sum directory_file_count \
           directory_modules directory_programs directory_functions \
           directory_branches directory_io

# Force numeric decimal separator to '.'
export LC_NUMERIC=C

# =============================================================================
# Function: count_fortran_metrics
#
# Analyse a single Fortran source file and extract structural metrics.
#
# The function performs the following steps:
#   1. Count total lines and code lines.
#   2. Remove inline comments to simplify pattern detection.
#   3. Detect program structure elements (modules, programs, procedures).
#   4. Estimate control-flow complexity using branching constructs.
#   5. Count basic I/O operations.
#   6. Compute derived metrics such as lines per subroutine.
#
# Arguments
# ---------
# $1 : path to the Fortran source file
#
# Output (tab-separated)
# ----------------------
# code_lines
# total_lines
# modules
# programs
# subroutines
# functions
# branches
# io_operations
# lines_per_subroutine
# branches_per_procedure
# =============================================================================

count_fortran_metrics() {
    # Print one tab-separated record of metrics for the Fortran file "$1":
    #   code total modules programs subroutines functions branches io_ops
    #   lines_per_sub branches_per_procedure
    # Every field is always numeric, so callers can split on whitespace.

    local file="$1"
    local code total clean_code
    local modules programs subroutines functions procedures
    local branches io_ops
    local lines_per_sub=0
    local branches_per_procedure=0

    # --------------------------------
    # Code / total lines
    # A code line is neither blank nor a full-line "!" comment. The counters
    # are printed with %d so a file without code lines (or without any line)
    # yields "0" rather than an empty field; an empty field would shift every
    # column for a whitespace-splitting reader. POSIX character classes are
    # used because "\s" is a GNU awk extension.
    read -r code total <<< "$(awk '
        {
            total++
            if ($0 !~ /^[[:space:]]*$/ && $0 !~ /^[[:space:]]*!/) code++
        }
        END { printf "%d\t%d\n", code, total }
    ' "$file")"
    code=${code:-0}
    total=${total:-0}

    # Drop everything from the first "!" on each line so keywords inside
    # comments are not counted. This also cuts a "!" inside a string literal.
    clean_code=$(sed 's/!.*$//' "$file")

    # --------------------------------
    # Structural elements
    # Only statements that START with the keyword are counted; prefixed forms
    # such as "pure function" or "integer function" are not matched.
    modules=$(grep -Ei '^\s*module\s+[a-zA-Z_]' <<< "$clean_code" | grep -vic 'procedure')
    programs=$(grep -Eic '^\s*program\s+' <<< "$clean_code")
    subroutines=$(grep -Eic '^\s*subroutine\s+[a-zA-Z_]' <<< "$clean_code")
    functions=$(grep -Eic '^\s*function\s+[a-zA-Z_]' <<< "$clean_code")
    procedures=$((subroutines + functions))

    # --------------------------------
    # Branch complexity proxy
    # Counts keyword occurrences (not lines), hence "grep -o | wc -l".
    # The arithmetic expansion strips padding some wc implementations emit.
    branches=$(grep -Eio '\b(if|elseif|else|select\s+case|case|do\s+|do\s+while|where|forall|cycle|exit)\b' <<< "$clean_code" | wc -l)
    branches=$((branches))

    # --------------------------------
    # I/O operations
    io_ops=$(grep -Eio '\b(read|write|open|close)\b' <<< "$clean_code" | wc -l)
    io_ops=$((io_ops))

    # --------------------------------
    # Derived metrics (left at 0 when the divisor would be 0)
    if (( subroutines > 0 )); then
        lines_per_sub=$(awk -v code="$code" -v s="$subroutines" 'BEGIN{printf "%.1f", code/s}')
    fi

    if (( procedures > 0 )); then
        branches_per_procedure=$(awk -v b="$branches" -v p="$procedures" 'BEGIN{printf "%.1f", b/p}')
    fi

    printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
        "$code" "$total" "$modules" "$programs" "$subroutines" "$functions" \
        "$branches" "$io_ops" "$lines_per_sub" "$branches_per_procedure"
}

# =============================================================================
# Function: gather_stats
#
# Traverse the current directory tree and analyse all Fortran source files.
#
# For each file the function:
#   - collects file-level metrics
#   - updates global counters
#   - accumulates directory-level statistics
#
# Results are stored in arrays for later formatting and printing.
# =============================================================================

gather_stats() {
    # Find every Fortran source below the current directory, analyse it with
    # count_fortran_metrics and fold the result into the global counters,
    # files_stats and the per-directory associative arrays.
    # Prints a message and exits the script with status 0 if no file matches.

    local -a files
    local file directory
    local code total modules programs subs funcs branches io_ops
    local lines_per_sub branches_per_procedure

    # -iname is case-insensitive, so "*.f90" also covers "*.F90" etc.
    mapfile -t files < <(
        find . -type f \
            \( -iname "*.f90" -o -iname "*.f" \
               -o -iname "*.for" -o -iname "*.f95" \
               -o -iname "*.inc" \)
    )

    if [[ ${#files[@]} -eq 0 ]]; then
        echo "No Fortran files found."
        exit 0
    fi

    for file in "${files[@]}"; do

        read -r code total modules programs subs funcs branches io_ops \
            lines_per_sub branches_per_procedure \
            <<< "$(count_fortran_metrics "$file")"

        # Files without any line or without any code line are ignored.
        if (( total == 0 || code == 0 )); then
            continue
        fi

        # Record layout: ten metric columns followed by the path (last, so a
        # path containing tabs can still be recovered as "the rest").
        files_stats+=("$code"$'\t'"$total"$'\t'"$modules"$'\t'"$programs"$'\t'"$subs"$'\t'"$funcs"$'\t'"$branches"$'\t'"$io_ops"$'\t'"$lines_per_sub"$'\t'"$branches_per_procedure"$'\t'"$file")

        total_all=$((total_all + total))
        total_code=$((total_code + code))
        file_count=$((file_count + 1))
        sum_lines=$((sum_lines + total))

        # Empty min/max means "no file seen yet".
        if [[ -z "$min_lines" ]] || (( total < min_lines )); then
            min_lines=$total
        fi
        if [[ -z "$max_lines" ]] || (( total > max_lines )); then
            max_lines=$total
        fi

        directory=$(dirname "$file")

        directory_code["$directory"]=$(( ${directory_code["$directory"]:-0} + code ))
        directory_all["$directory"]=$(( ${directory_all["$directory"]:-0} + total ))
        directory_subs["$directory"]=$(( ${directory_subs["$directory"]:-0} + subs ))
        directory_modules["$directory"]=$(( ${directory_modules["$directory"]:-0} + modules ))
        directory_programs["$directory"]=$(( ${directory_programs["$directory"]:-0} + programs ))
        directory_functions["$directory"]=$(( ${directory_functions["$directory"]:-0} + funcs ))
        directory_branches["$directory"]=$(( ${directory_branches["$directory"]:-0} + branches ))
        directory_io["$directory"]=$(( ${directory_io["$directory"]:-0} + io_ops ))

        # lines_per_sub is fractional, so the sum is kept via awk.
        directory_lines_per_sub_sum["$directory"]=$(awk \
            -v prev="${directory_lines_per_sub_sum["$directory"]:-0}" \
            -v new="$lines_per_sub" \
            'BEGIN{printf "%.1f", prev+new}')

        directory_file_count["$directory"]=$(( ${directory_file_count["$directory"]:-0} + 1 ))
    done
}

# =============================================================================
# Function: print_global_stats
#
# Display overall statistics describing the analysed codebase.
#
# Reported metrics include:
#   - number of analysed files
#   - total number of lines
#   - minimum file size
#   - maximum file size
#   - average file size
# =============================================================================

print_global_stats() {

    # Mean file length in lines; stays 0 when no file was analysed.
    avg_lines=0
    (( file_count > 0 )) &&
        avg_lines=$(awk -v s="$sum_lines" -v n="$file_count" 'BEGIN{printf "%.1f", s/n}')

    # Blank separator line, title and underline.
    printf '\n%s\n%s\n' "GLOBAL STATISTICS" "-----------------"

    # printf re-applies its format to every label/value pair.
    printf "%-20s %10d\n" \
        "Files analysed:" "$file_count" \
        "Total lines:"    "$total_all" \
        "Min file size:"  "$min_lines" \
        "Max file size:"  "$max_lines"

    # The average is already formatted text, hence %s rather than %d.
    printf "%-20s %10s\n" "Avg file size:" "$avg_lines"
}

# =============================================================================
# Function: print_file_table
#
# Print a formatted table containing statistics for each source file.
#
# Files are sorted by their percentage contribution to the total code
# base. The table includes structural metrics and derived indicators
# such as lines per subroutine and branching density.
# =============================================================================

print_file_table() {
    # Print the per-file table, largest share of the code base first,
    # followed by a TOTAL row.
    # Reads the globals files_stats, total_code and total_all.

    local -a sorted=()
    local line file
    local code total modules programs subs funcs branches io_ops
    local lines_per_sub branches_per_procedure
    local percent_total percent_code total_percent_code
    local total_subs=0 total_programs=0 total_modules=0
    local total_funcs=0 total_branches=0 total_io=0

    # One awk pass prefixes each record with %Tot and %Code, then the rows are
    # sorted on %Tot. Records are passed through printf '%s' and read -r so
    # backslashes or glob characters in a path are never interpreted.
    if (( ${#files_stats[@]} > 0 )); then
        mapfile -t sorted < <(
            printf '%s\n' "${files_stats[@]}" |
                awk -F'\t' -v t="$total_code" '{
                    pct_total = (t  > 0) ? ($1 / t)  * 100 : 0
                    pct_code  = ($2 > 0) ? ($1 / $2) * 100 : 0
                    printf "%.1f\t%.1f\t%s\n", pct_total, pct_code, $0
                }' |
                sort -nr -k1,1
        )
    fi

    printf "\n%-35s %6s %6s %6s %4s %4s %5s %5s %7s %5s %7s %7s\n" \
    "File" "Lines" "%Code" "%Tot" "Prog" "Mod" "Subs" "Func" "Branch" "IO" "L/Sub" "Br/Proc"

    printf "%-35s %6s %6s %6s %4s %4s %5s %5s %7s %5s %7s %7s\n" \
    "----" "-----" "-----" "-----" "----" "---" "----" "----" "------" "--" "-----" "-------"

    for line in "${sorted[@]}"; do

        # The path is the last field so it receives the remainder of the line.
        IFS=$'\t' read -r percent_total percent_code code total modules \
            programs subs funcs branches io_ops lines_per_sub \
            branches_per_procedure file <<< "$line"

        # Keep the tail of long paths: it carries the file name.
        if (( ${#file} > 35 )); then
            file="...${file: -32}"
        fi

        total_subs=$(( total_subs + subs ))
        total_programs=$(( total_programs + programs ))
        total_modules=$(( total_modules + modules ))
        total_funcs=$(( total_funcs + funcs ))
        total_branches=$(( total_branches + branches ))
        total_io=$(( total_io + io_ops ))

        printf "%-35s %6d %6s %6s %4d %4d %5d %5d %7d %5d %7s %7s\n" \
        "$file" "$total" "$percent_code" "$percent_total" \
        "$programs" "$modules" "$subs" "$funcs" "$branches" "$io_ops" \
        "$lines_per_sub" "$branches_per_procedure"
    done

    # Guarded so an empty data set cannot trigger a division by zero.
    total_percent_code=$(awk -v c="$total_code" -v a="$total_all" \
        'BEGIN { printf "%.1f", (a > 0) ? (c / a) * 100 : 0 }')

    printf "%-35s %6d %6s %6s %4s %4s %5d %5s %7s %5s %7s %7s\n" \
    "TOTAL" "$total_all" "$total_percent_code" "100.0" \
    "$total_programs" "$total_modules" "$total_subs" "$total_funcs" "$total_branches" "$total_io" "-" "-"
}

# =============================================================================
# Function: print_directory_table
#
# Print aggregated statistics for each directory containing
# Fortran source files.
#
# Metrics include total code lines, total lines, number of
# subroutines, average lines per subroutine, and percentage
# of the total code base.
# =============================================================================

print_directory_table() {
    # Print one row per directory, largest share of the code base first.
    # Reads the globals directory_code, directory_all, directory_subs and
    # total_code.

    local directory display_directory
    local code total subs avg_lines_sub percent_total

    printf "\n%-35s %12s %12s %8s %13s %12s\n" \
        "Directory" "Code Lines" "All Lines" "Subs" "Avg Lines/Sub" "% of Total"

    printf "%-35s %12s %12s %8s %13s %12s\n" \
        "---------" "----------" "---------" "----" "-------------" "----------"

    for directory in "${!directory_code[@]}"; do

        code=${directory_code["$directory"]}
        total=${directory_all["$directory"]:-0}
        subs=${directory_subs["$directory"]:-0}

        # Average = directory code lines / directory subroutines, the same
        # definition used per file. Dividing a sum of per-file averages by
        # the subroutine count would not be an average.
        avg_lines_sub=0
        if (( subs > 0 )); then
            avg_lines_sub=$(awk -v c="$code" -v s="$subs" \
                'BEGIN{printf "%.1f", c/s}')
        fi

        percent_total=$(awk -v c="$code" -v t="$total_code" \
            'BEGIN { printf "%.1f", (t > 0) ? (c / t) * 100 : 0 }')

        display_directory="$directory"
        if (( ${#directory} > 35 )); then
            display_directory="...${directory: -32}"
        fi

        # Prefix the row with its sort key and a tab. Sorting on a column of
        # the formatted row breaks when a directory name contains spaces.
        printf '%s\t' "$percent_total"
        printf "%-35s %12d %12d %8d %13s %11s%%\n" \
            "$display_directory" "$code" "$total" "$subs" "$avg_lines_sub" "$percent_total"

    done | sort -t $'\t' -k1,1nr | cut -f2-
}

# =============================================================================
# Function: write_csv
#
# Export collected statistics to CSV files for external analysis.
#
# Two files are produced:
#   code_filestats.csv
#       Per-file metrics.
#   code_dirstats.csv
#       Aggregated statistics by directory.
#
# These files can be imported into spreadsheet software or analysed
# using Python, R, or other data-analysis tools.
# =============================================================================

write_csv() {
    # Write per-file statistics to "$csv_file" and per-directory statistics to
    # "$directory_csv". Both files are rewritten from scratch on every call and
    # start with a header row matching their data columns.
    # Reads the globals files_stats, total_code, total_all and directory_*.

    local entry directory file
    local code total total_lines modules programs subs funcs branches io_ops
    local lines_per_sub branches_per_procedure
    local percent_code percent_total avg_lines_sub
    local total_modules=0 total_programs=0 total_subs=0
    local total_funcs=0 total_branches=0 total_io=0
    local dq='"'

    echo

    # The whole group is redirected with ">" so the header lands in the file
    # and earlier runs are overwritten rather than appended to.
    {
        echo "File,Code,Total,%Code,%Total,Modules,Programs,Subroutines,Functions,Branches,IO,Lines/Sub,Branches/Proc"

        for entry in "${files_stats[@]}"; do

            # The path is the last field so it receives the rest of the record.
            IFS=$'\t' read -r code total_lines modules programs subs funcs \
                branches io_ops lines_per_sub branches_per_procedure file \
                <<< "$entry"

            read -r percent_code percent_total <<< "$(awk \
                -v c="$code" -v l="$total_lines" -v t="$total_code" 'BEGIN {
                    printf "%.1f %.1f\n",
                        (l > 0) ? (c / l) * 100 : 0,
                        (t > 0) ? (c / t) * 100 : 0
                }')"

            # CSV quoting: embedded double quotes are doubled.
            printf '"%s",%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' \
                "${file//$dq/$dq$dq}" "$code" "$total_lines" \
                "$percent_code" "$percent_total" \
                "$modules" "$programs" "$subs" "$funcs" "$branches" "$io_ops" \
                "$lines_per_sub" "$branches_per_procedure"
        done
    } > "$csv_file"

    echo "File stats CSV saved to $csv_file"

    {
        # Header and rows carry the same eleven columns.
        echo "Directory,Code,Total,Modules,Programs,Subroutines,Functions,Branches,IO Ops,Lines/Sub,%Total"

        for directory in "${!directory_code[@]}"; do

            code=${directory_code["$directory"]}
            total=${directory_all["$directory"]:-0}
            modules=${directory_modules["$directory"]:-0}
            programs=${directory_programs["$directory"]:-0}
            subs=${directory_subs["$directory"]:-0}
            funcs=${directory_functions["$directory"]:-0}
            branches=${directory_branches["$directory"]:-0}
            io_ops=${directory_io["$directory"]:-0}

            total_modules=$(( total_modules + modules ))
            total_programs=$(( total_programs + programs ))
            total_subs=$(( total_subs + subs ))
            total_funcs=$(( total_funcs + funcs ))
            total_branches=$(( total_branches + branches ))
            total_io=$(( total_io + io_ops ))

            # Same definition as the per-file metric: code lines / subroutines.
            avg_lines_sub=0
            if (( subs > 0 )); then
                avg_lines_sub=$(awk -v c="$code" -v s="$subs" \
                    'BEGIN{printf "%.1f", c/s}')
            fi

            percent_total=$(awk -v c="$code" -v t="$total_code" \
                'BEGIN { printf "%.1f", (t > 0) ? (c / t) * 100 : 0 }')

            printf '"%s",%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' \
                "${directory//$dq/$dq$dq}" "$code" "$total" \
                "$modules" "$programs" "$subs" "$funcs" "$branches" "$io_ops" \
                "$avg_lines_sub" "$percent_total"
        done

        # Percentages are written without a unit, like the data rows.
        printf '"TOTAL",%s,%s,%s,%s,%s,%s,%s,%s,-,100.0\n' \
            "$total_code" "$total_all" \
            "$total_modules" "$total_programs" "$total_subs" \
            "$total_funcs" "$total_branches" "$total_io"
    } > "$directory_csv"

    echo "Directory stats CSV saved to $directory_csv"
}

# =============================================================================
# Main program
# =============================================================================

# Collect the metrics first, then render each report in order.
for stage in gather_stats print_global_stats print_file_table print_directory_table; do
    "$stage"
done

# CSV files are only produced when export was requested on the command line.
if [[ "$WRITE_CSV" == true ]]; then
    write_csv
fi
