From 73ca4a280448ea2bbc8c73dc069e8b7d74c9384a Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Fri, 20 Jun 2025 17:45:00 +0200 Subject: [PATCH 1/4] Rename lstraverse.sh -> create_filetree.sh --- tools/{lstraverse.sh => create_filetree.sh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tools/{lstraverse.sh => create_filetree.sh} (100%) diff --git a/tools/lstraverse.sh b/tools/create_filetree.sh similarity index 100% rename from tools/lstraverse.sh rename to tools/create_filetree.sh -- GitLab From f60bcba2ea1ef969d78399b136dbd3ac3193f94d Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Fri, 20 Jun 2025 17:44:37 +0200 Subject: [PATCH 2/4] Rename lstraverse.sh -> create_filetree.sh --- destine_analytics/scripts/draw_filetree.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 2c12b55..350b273 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -20,7 +20,7 @@ df = None def _get_parser(): parser = ArgumentParser( - description="Parser for output of `tools/lstraverse.sh`.", + description="Parser for output of `tools/create_filetree.sh`.", formatter_class=RawDescriptionHelpFormatter, ) parser.add_argument( @@ -28,7 +28,7 @@ def _get_parser(): type=str, help=( "If it is a directory, local (or remote) path to draw filetree from. " - "If it is a file, it is assumed to be the output of tools/lstraverse.sh." + "If it is a file, it is assumed to be the output of tools/create_filetree.sh." 
), ) parser.add_argument( @@ -130,7 +130,7 @@ def _cache_write(cache, obj, name, /, *args, **kwargs): def _add_path_column(df): - # For lstraverse.sh output, Parent already contains the full directory path + # For create_filetree.sh output, Parent already contains the full directory path # We don't need to normalize it further, just create the full file paths df["Path"] = (df["Parent"] + "/" + df["Filename"]).apply( lambda x: x.removeprefix("/") @@ -139,7 +139,7 @@ def _add_path_column(df): def _get_subtree_df(df, root): - # For lstraverse.sh output, Parent contains full directory paths + # For create_filetree.sh output, Parent contains full directory paths df = df.copy() # Normalize requested root -- GitLab From 55324bca2d1260845d05b6616d4358a60ed95905 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Fri, 20 Jun 2025 17:47:32 +0200 Subject: [PATCH 3/4] Update README.md --- README.md | 436 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 380 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 5b00921..66d9d83 100644 --- a/README.md +++ b/README.md @@ -1,92 +1,416 @@ -# DestinE-Analytics +# DestinE Analytics +A comprehensive analytics toolkit for data analysis, visualization, and log processing, specifically designed for AutoSubmit (AS) workflow analysis and file system exploration. +## Overview -## Getting started +DestinE Analytics provides powerful tools for: +- **File System Analysis**: Generate and visualize file trees with size analysis +- **Log Processing**: Parse and analyze AutoSubmit logs for workflow monitoring +- **Gantt Chart Generation**: Create detailed timeline visualizations of job execution +- **Data Visualization**: Interactive plots using both Matplotlib and Plotly -To make it easy for you to get started with GitLab, here's a list of recommended next steps. +## Features -Already a pro? Just edit this README.md and make it your own. Want to make it easy? 
[Use the template at the bottom](#editing-this-readme)! +### 🗂️ File Tree Analysis +- Generate comprehensive file tree listings with metadata +- Interactive file size visualizations (treemap, sunburst, icicle charts) +- Support for remote file system analysis via SSH +- Caching system for improved performance -## Add your files +### 📊 AutoSubmit Log Analysis +- Parse AutoSubmit log files and extract job execution data +- Generate Gantt charts showing job timelines and dependencies +- Track job status, completion rates, and failure analysis +- Support for ensemble runs and chunk-based workflows -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: +### 📈 Data Visualization +- Multiple chart types: treemap, sunburst, icicle, Gantt charts +- Interactive Plotly visualizations +- Matplotlib-based static plots +- Customizable styling and color schemes +### 🔧 Utility Tools +- Shell scripts for file tree generation +- Log transfer utilities for remote systems +- Caching mechanisms for performance optimization + +## Installation + +### Prerequisites +- Python 3.9 or higher +- Bash shell (for shell scripts) + +### Setup + +1. **Clone the repository**: + ```bash + git clone https://earth.bsc.es/gitlab/ialsina/destine_analytics.git + cd destine_analytics + ``` + +2. **Create a virtual environment**: + ```bash + python -m venv .venv + source .venv/bin/activate # On Windows: .venv\Scripts\activate + ``` + +3. **Install dependencies**: + ```bash + pip install -r requirements.txt + ``` + +4. **Install the package**: + ```bash + pip install -e . + ``` + +5. 
**Configure environment** (optional): + ```bash + cp .env.example .env + # Edit .env with your configuration + ``` + +## Usage + +### Command Line Tools + +**💡 Tip**: Use `-h` or `--help` with any command to see all available options and detailed usage information. + +#### 1. File Tree Visualization (`draw-filetree`) + +Generate interactive file tree visualizations from directories or pre-generated file listings. + +```bash +# See all available options +draw-filetree -h + +# Analyze a local directory +draw-filetree /path/to/directory --figure treemap --interactive + +# Use pre-generated file tree output +draw-filetree tools/ls_filetree_Documents.txt --figure sunburst + +# Remote directory analysis +draw-filetree /remote/path --ssh-command "ssh user@host" --figure icicle + +# Save output +draw-filetree /path/to/directory --figure all --output figures/ ``` -cd existing_repo -git remote add origin https://earth.bsc.es/gitlab/ialsina/destine_analytics.git -git branch -M main -git push -uf origin main + +**Options**: +- `--figure`: Chart type (`treemap`, `sunburst`, `icicle`, `all`) +- `--interactive`: Show interactive plot +- `--only-directories`: Show only directories (for large trees) +- `--root`: Specify root directory for subtree analysis +- `--cache`: Enable caching for performance +- `--output`: Output directory for saved figures + +#### 2. Gantt Chart Generation (`create-ganttchart`) + +Create Gantt charts from AutoSubmit logs showing job execution timelines. + +**Note**: Currently, running `tools/create_filetree.sh` is a requirement for `create-ganttchart` to function properly. 
+ +```bash +# See all available options +create-ganttchart -h + +# Basic Gantt chart +create-ganttchart o005 --interactive + +# Filter by chunk range +create-ganttchart o005 --chunks 38 52 --ignore AQUA + +# Color by different attributes +create-ganttchart o005 --color job --starttype submitted + +# Add statistical information +create-ganttchart o005 --add-information stats --durations + +# Save with custom settings +create-ganttchart o005 --output figures/ --dpi 300 ``` -## Integrate with your tools +**Options**: +- `--chunks`: Specify chunk range to analyze +- `--ignore`: Ignore jobs with specific prefixes +- `--color`: Color scheme (`job`, `chunk`, `status`) +- `--starttype`: Use `started` or `submitted` timestamps +- `--add-information`: Add extra info (`available`, `stats`) +- `--durations`: Show job durations +- `--pivot`: Organize by chunk in y-axis -- [ ] [Set up project integrations](https://earth.bsc.es/gitlab/ialsina/destine_analytics/-/settings/integrations) +### Shell Tools -## Collaborate with your team +#### 1. File Tree Generation (`tools/create_filetree.sh`) -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Automatically merge when pipeline succeeds](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) +Generate comprehensive file listings with metadata. -## Test and Deploy +```bash +# Generate file tree listing +./tools/create_filetree.sh /path/to/directory -Use the built-in continuous integration in GitLab. 
+# Output will be saved as ls_filetree_<dirname>.txt +``` -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing(SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) +#### 2. Log Transfer (`tools/destine-vm-log-transfer.sh`) -*** +Transfer logs from remote AutoSubmit systems. -# Editing this README +```bash +# Transfer logs for experiment +./tools/destine-vm-log-transfer.sh o005 --output /local/path +``` -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thank you to [makeareadme.com](https://www.makeareadme.com/) for this template. +**Requirements**: Configure `.env` file with SSH credentials and paths. -## Suggestions for a good README -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. +### Python API -## Name -Choose a self-explaining name for your project. +#### File Tree Analysis -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. 
If there are alternatives to your project, this is a good place to list differentiating factors. +```python +from destine_analytics.lsreport import read_filetree +from destine_analytics.scripts.draw_filetree import draw -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. +# Read file tree data +df = read_filetree("ls_filetree_output.txt") -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. +# Create visualization +sizes = df.groupby("Parent")["Filesize"].sum().to_dict() +parents = {row["Filename"]: row["Parent"] for _, row in df.iterrows()} -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. +draw(sizes, parents, figure="treemap", interactive=True) +``` -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. +#### Log Analysis -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. 
+```python +from destine_analytics.logreport import get_aslogs_df, jobs_df_from_aslogs_df +from destine_analytics.plots import plot_job_gantt_chart -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. +# Parse AutoSubmit logs +aslogs_df = get_aslogs_df(Path("path/to/aslogs")) +jobs_df = jobs_df_from_aslogs_df(aslogs_df, Path("path/to/aslogs")) -## Contributing -State if you are open to contributions and what your requirements are for accepting them. +# Create Gantt chart +fig = plot_job_gantt_chart( + jobs_df, + title="Job Execution Timeline", + color_key="job", + pivot="on_overlap" +) +``` -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. +#### Plotting Decorators -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. 
+```python +from destine_analytics.decorators.asrun import asrun_with_stats + +# Add statistical information to plots +@asrun_with_stats(log_path=Path("logs"), after=datetime(2024, 1, 1)) +def my_plot_function(df): + # Your plotting code here + pass +``` + +## Configuration + +### Environment Variables + +Create a `.env` file by copying from the provided template: + +```bash +# Copy the example configuration +cp .env.example .env + +# Edit .env with your specific configuration +``` + +The `.env` file should contain the following variables: + +```bash +# Log directory path +LOG_DIR=/path/to/logs + +# SSH configuration for remote access +file1=/path/to/ssh/key1 +file2=/path/to/ssh/key2 +user1=username1 +user2=username2 +machine1=machine1.example.com +machine2=machine2.example.com +logdir=/local/log/directory +``` -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. +### Cache Configuration + +The package uses a caching system located at `~/.cache/` to improve performance for repeated operations. Cache files are automatically managed and can be cleared manually if needed. 
+ +## Project Structure + +``` +destine_analytics/ +├── destine_analytics/ # Main package +│ ├── scripts/ # Command-line entry points +│ │ ├── draw_filetree.py # File tree visualization +│ │ ├── create_ganttchart.py # Gantt chart generation +│ │ └── utils.py # Shared utilities +│ ├── decorators/ # Plotting decorators +│ │ └── asrun.py # AutoSubmit run decorators +│ ├── logreport.py # Log parsing and analysis +│ ├── lsreport.py # File tree parsing +│ ├── plots.py # Visualization functions +│ └── paths.py # Path configuration +├── tools/ # Shell utilities +│ ├── create_filetree.sh # File tree generation +│ └── destine-vm-log-transfer.sh # Log transfer utility +├── requirements.txt # Python dependencies +├── pyproject.toml # Package configuration +└── README.md # This file +``` + +## Dependencies + +### Core Dependencies +- **matplotlib**: Static plotting and visualization +- **plotly**: Interactive visualizations +- **pandas**: Data manipulation and analysis +- **numpy**: Numerical computations +- **tqdm**: Progress bars for long operations + +### Optional Dependencies +- **pytest**: Testing framework +- **black**: Code formatting +- **isort**: Import sorting +- **flake8**: Code linting + +## Development + +### Setting up Development Environment + +```bash +# Install development dependencies +pip install -e ".[dev]" + +# Run tests +pytest + +# Format code +black destine_analytics/ +isort destine_analytics/ + +# Lint code +flake8 destine_analytics/ +``` + +### Adding New Features + +1. **Scripts**: Add new command-line tools in `destine_analytics/scripts/` +2. **Core Functions**: Add utility functions in appropriate modules +3. **Visualizations**: Add new plotting functions in `plots.py` +4. 
**Decorators**: Add new decorators in `destine_analytics/decorators/` + +### Testing + +```bash +# Run all tests +pytest + +# Run with coverage +pytest --cov=destine_analytics + +# Run specific test file +pytest tests/test_logreport.py +``` + +## Examples + +### Example 1: File System Analysis + +```bash +# Generate file tree for a large directory +./tools/create_filetree.sh /home/user/documents + +# Create interactive treemap visualization +draw-filetree tools/ls_filetree_documents.txt \ + --figure treemap \ + --interactive \ + --output figures/ +``` + +### Example 2: Workflow Analysis + +```bash +# Create Gantt chart for AutoSubmit experiment +create-ganttchart o005 \ + --chunks 1 50 \ + --color chunk \ + --add-information stats \ + --durations \ + --output figures/ +``` + +### Example 3: Remote Analysis + +```bash +# Analyze remote file system +draw-filetree /remote/path \ + --ssh-command "ssh user@remote-host" \ + --figure sunburst \ + --cache \ + --output remote_analysis/ +``` + +## Troubleshooting + +### Common Issues + +1. **Permission Denied**: Ensure shell scripts are executable + ```bash + chmod +x tools/*.sh + ``` + +2. **SSH Connection Issues**: Verify SSH configuration in `.env` file + +3. **Memory Issues**: Use `--only-directories` flag for large file trees + +4. **Cache Issues**: Clear cache directory if needed + ```bash + rm -rf ~/.cache/draw_filetree/ + ``` + +### Performance Tips + +- Use caching for repeated operations +- Filter large datasets with `--chunks` or `--root` +- Use `--only-directories` for very large file trees +- Consider using remote analysis for large remote directories + +## Contributing + +1. Fork the repository +2. Create a feature branch +3. Make your changes +4. Add tests for new functionality +5. Ensure code passes linting and formatting +6. Submit a pull request ## License -For open source projects, say how it is licensed. 
-## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. +[Add your license information here] + +## Support + +For issues and questions: +- Create an issue in the repository +- Check the troubleshooting section +- Review the examples for usage patterns + +## Roadmap + +- [ ] Add support for more visualization types +- [ ] Implement real-time log monitoring +- [ ] Add web-based dashboard +- [ ] Support for more log formats +- [ ] Enhanced caching and performance optimizations -- GitLab From 572dbf0ec3b3df5400236c3b0b2db4983f776d31 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Fri, 20 Jun 2025 17:51:47 +0200 Subject: [PATCH 4/4] Add: `-e` option raises `NotImplementedError` for ganttchart creation --- destine_analytics/scripts/create_ganttchart.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/destine_analytics/scripts/create_ganttchart.py b/destine_analytics/scripts/create_ganttchart.py index cee7394..013582a 100755 --- a/destine_analytics/scripts/create_ganttchart.py +++ b/destine_analytics/scripts/create_ganttchart.py @@ -47,6 +47,14 @@ def _get_parser() -> ArgumentParser: choices=get_available_expids(), help="Experiment id", ) + parser.add_argument( + "-e", + "--ssh-command", + type=str, + help=( + "For remote path scoping: ssh command to execute in order to access to remote shell." 
+ ), + ) parser.add_argument( "--chunks", type=int, @@ -215,6 +223,7 @@ def main(): args = _get_parser().parse_args() expid = args.expid + ssh_command = args.ssh_command chunks0, chunks1 = args.chunks ignore = args.ignore color = args.color @@ -238,6 +247,12 @@ def main(): if not interactive and not output: raise ValueError("Interactive mode is required when output is not provided.") + if ssh_command: + raise NotImplementedError( + "SSH command is not yet implemented. Please, use the `destine-vm-log-transfer.sh` " + "script to transfer the logs to the local machine." + ) + if by_member and not by_split: warnings.warn( "Independent pivots are not yet implemented. " -- GitLab