From 8552d4ecc29e08cc763c0af8a5a044133dd9fc22 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:25:40 +0200 Subject: [PATCH 1/7] Fix: Directory size double computation --- destine_analytics/scripts/draw_filetree.py | 91 +++++++++++----------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 350b273..50673fc 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -202,6 +202,7 @@ def draw( root (str | None): Subdirectory to start visualization from. Can be absolute or relative to the common root. figure (str): One of 'treemap', 'sunburst', or 'icicle'. """ + if figure not in {"treemap", "sunburst", "icicle"}: raise ValueError( f"Unsupported figure '{figure}'. Use 'treemap', 'sunburst', or 'icicle'." @@ -368,54 +369,51 @@ def compute_directory_size(df, only_directories=False): if df.empty: return {}, {} - # Initialize dictionaries - direct_sizes = {} # Size of files directly in each directory - total_sizes = {} # Total size including subdirectories + total_sizes = {} parents = {} - # First pass: compute direct sizes and parent relationships - # Start with directories - for directory in tqdm(sorted(df["Parent"].unique())): - # Get files directly in this directory - mask = df["Parent"] == directory - direct_sizes[directory] = int(df.loc[mask, "Filesize"].sum()) - total_sizes[directory] = direct_sizes[directory] # Initialize total size - - # Set parent relationship based on actual directory structure - if directory == "/" or directory == "": - parents[directory] = "" - else: - # Find the parent directory by going up one level - parent_dir = os.path.dirname(directory) - if parent_dir == directory: # Root directory - parents[directory] = "" - else: - parents[directory] = parent_dir - - # Add files if not only_directories - if not only_directories: - for _, row in tqdm(df.iterrows(), total=len(df)): - file_path = f"{row['Parent']}/{row['Filename']}" - if ( - file_path not in direct_sizes - ): # Only add if not already added as a directory - direct_sizes[file_path] = int(row["Filesize"]) - total_sizes[file_path] = direct_sizes[file_path] - parents[file_path] = row["Parent"] - - # Second pass: propagate sizes up the tree - # Process paths from longest to shortest - for path in sorted(direct_sizes.keys(), key=len, reverse=True): - parent = parents[path] - if parent and parent in total_sizes: # If not root and parent exists + # Step 1: Process all files, recording their size and parent. + for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"): + parent_path = row["Parent"] + file_path = os.path.join(parent_path, row["Filename"]) + total_sizes[file_path] = int(row["Filesize"]) + parents[file_path] = parent_path + + # Step 2: Ensure all directories and their ancestors are in the data structures. + all_paths = set(parents.keys()) + for path in all_paths: + parent = parents.get(path) + while parent and parent not in parents: + if parent not in total_sizes: + total_sizes[parent] = 0 + + grandparent = os.path.dirname(parent) + if grandparent == parent: # Reached the root + parents[parent] = "" + break + parents[parent] = grandparent + parent = grandparent + + # Step 3: Propagate sizes up the tree from the longest paths to the shortest. + for path in sorted(total_sizes.keys(), key=len, reverse=True): + parent = parents.get(path) + if parent is not None and parent in total_sizes: total_sizes[parent] += total_sizes[path] - # Ensure we have at least one root element - if not any(parent == "" for parent in parents.values()): - # Find the shortest path as root - if direct_sizes: - shortest_path = min(direct_sizes.keys(), key=len) - parents[shortest_path] = "" + # If only directories are requested, filter out the files. + if only_directories: + # A directory is any path that is a parent of another path. + dir_paths = set(parents.values()) + + # Filter total_sizes and parents to only include directories. + total_sizes = {path: size for path, size in total_sizes.items() if path in dir_paths} + parents = {path: parent for path, parent in parents.items() if path in dir_paths} + + # Any node whose parent is not in the dataset is a root. + all_paths = set(total_sizes.keys()) + for path, parent in parents.items(): + if parent not in all_paths: + parents[path] = "" return total_sizes, parents @@ -461,9 +459,9 @@ def _get_dataframe(path_str, ssh_command, expid, cache): def main(): - global df + args = _get_parser().parse_args() - path_str = args.path + path_str = os.path.normpath(args.path) ssh_command = args.ssh_command expid = args.expid root = args.root @@ -484,6 +482,7 @@ def main(): df = _get_dataframe(path_str, ssh_command, expid, cache) _cache_write(cache, df, "df", **cache_key_args) + df["Parent"] = df["Parent"].apply(os.path.normpath) df = _add_path_column(df) df = _get_subtree_df(df, root) -- GitLab From 47d12a42ccce3b49259b80b04b792f373ff90cf0 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:26:25 +0200 Subject: [PATCH 2/7] Fix: Visualization tree root --- destine_analytics/scripts/draw_filetree.py | 50 ++++++++++++---------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 50673fc..3d248dd 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -15,8 +15,6 @@ import plotly.express as px from destine_analytics.lsreport import read_filetree from .utils import sanitize_output -df = None - def _get_parser(): parser = ArgumentParser( @@ -50,8 +48,8 @@ def _get_parser(): "-r", "--root", type=str, - default="", - help="Root directory to display. Must be within the file tree examined.", + default=None, + help="Root directory to display. Defaults to the main path.", ) parser.add_argument( "-d", @@ -132,9 +130,7 @@ def _cache_write(cache, obj, name, /, *args, **kwargs): def _add_path_column(df): # For create_filetree.sh output, Parent already contains the full directory path # We don't need to normalize it further, just create the full file paths - df["Path"] = (df["Parent"] + "/" + df["Filename"]).apply( - lambda x: x.removeprefix("/") - ) + df["Path"] = df.apply(lambda row: os.path.join(row["Parent"], row["Filename"]), axis=1) return df @@ -156,7 +152,12 @@ def _get_subtree_df(df, root): abs_root = os.path.normpath(os.path.join(common_root, root)) # Filter to subtree - only include files that are under the absolute root - mask = df["Parent"].str.startswith(abs_root) + if abs_root == "/": + mask = df["Parent"].str.startswith("/") + else: + mask = (df["Parent"] == abs_root) | ( + df["Parent"].str.startswith(abs_root + "/") + ) subtree_df = df[mask].copy() if subtree_df.empty: @@ -226,13 +227,22 @@ def draw( # Add human readable size for hover viz_df["size_str"] = viz_df["value"].apply(_human_readable_size) - # Filter by root if specified + # If a root is specified, filter the data to start from there. if root: - mask = viz_df["id"].str.startswith(root) + # This is safer than `startswith` which can match `/path/to/dir-other` + if root == "/": + mask = viz_df["id"].str.startswith("/") + else: + mask = (viz_df["id"] == root) | viz_df["id"].str.startswith(root + "/") viz_df = viz_df[mask] + if viz_df.empty: raise ValueError(f"No entries found under root '{root}'.") + # Any node whose parent is not in the dataset is a root. + all_ids = set(viz_df["id"]) + viz_df.loc[~viz_df["parent"].isin(all_ids), "parent"] = "" + # Create the figure fig = getattr(px, figure)( viz_df, @@ -247,19 +257,13 @@ def draw( ) # Get the root for the title - if root: - abs_root = root + root_entries = viz_df[viz_df["parent"] == ""] + if not root_entries.empty: + abs_root = root_entries["id"].iloc[0] + elif not viz_df.empty: + abs_root = viz_df["id"].iloc[0] else: - # Try to find the root element (one with empty parent) - root_entries = viz_df[viz_df["parent"] == ""] - if not root_entries.empty: - abs_root = root_entries["id"].iloc[0] - else: - # Fallback: use the first entry or a default - if not viz_df.empty: - abs_root = viz_df["id"].iloc[0] - else: - abs_root = "unknown" + abs_root = root or "unknown" # Update traces for better visibility fig.update_traces( @@ -464,7 +468,7 @@ def main(): path_str = os.path.normpath(args.path) ssh_command = args.ssh_command expid = args.expid - root = args.root + root = os.path.normpath(args.root) if args.root is not None else path_str only_directories = args.only_directories figure = args.figure interactive = args.interactive -- GitLab From 144093d3565d7d58166c00ee66f8bc7847ad8bc5 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:27:35 +0200 Subject: [PATCH 3/7] Style: Cosmetics --- destine_analytics/scripts/draw_filetree.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 3d248dd..173d96d 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -7,9 +7,10 @@ import os import shlex import subprocess from io import StringIO +from typing import Mapping + from tqdm import tqdm import pandas as pd -from typing import Mapping import plotly.express as px from destine_analytics.lsreport import read_filetree @@ -130,7 +131,9 @@ def _cache_write(cache, obj, name, /, *args, **kwargs): def _add_path_column(df): # For create_filetree.sh output, Parent already contains the full directory path # We don't need to normalize it further, just create the full file paths - df["Path"] = df.apply(lambda row: os.path.join(row["Parent"], row["Filename"]), axis=1) + df["Path"] = df.apply( + lambda row: os.path.join(row["Parent"], row["Filename"]), axis=1 + ) return df @@ -410,8 +413,12 @@ def compute_directory_size(df, only_directories=False): dir_paths = set(parents.values()) # Filter total_sizes and parents to only include directories. - total_sizes = {path: size for path, size in total_sizes.items() if path in dir_paths} - parents = {path: parent for path, parent in parents.items() if path in dir_paths} + total_sizes = { + path: size for path, size in total_sizes.items() if path in dir_paths + } + parents = { + path: parent for path, parent in parents.items() if path in dir_paths + } # Any node whose parent is not in the dataset is a root. all_paths = set(total_sizes.keys()) -- GitLab From 3b4e86d1f036a043ebfd37e9a8618761273d7c55 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:28:36 +0200 Subject: [PATCH 4/7] Refactor: Move function `compute_directory_size` --- destine_analytics/lsreport.py | 69 ++++++++++++++++++++++ destine_analytics/scripts/draw_filetree.py | 69 +--------------------- 2 files changed, 70 insertions(+), 68 deletions(-) diff --git a/destine_analytics/lsreport.py b/destine_analytics/lsreport.py index 6ca20d9..e335203 100644 --- a/destine_analytics/lsreport.py +++ b/destine_analytics/lsreport.py @@ -105,3 +105,72 @@ def read_filetree(file: Union[str, Path, StringIO], common: bool = True) -> pd.D lambda x: x.removeprefix(value).removeprefix("/") ) return df + + +def compute_directory_size(df, only_directories=False): + """ + Compute directory sizes and their hierarchical relationships. + + Parameters: + df (pd.DataFrame): DataFrame with 'Parent' and 'Filename' columns + only_directories (bool): If True, only show directories in the visualization + + Returns: + tuple: (sizes, parents) where: + - sizes: dict mapping paths to their total sizes + - parents: dict mapping paths to their parent directories + """ + if df.empty: + return {}, {} + + total_sizes = {} + parents = {} + + # Step 1: Process all files, recording their size and parent. + for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"): + parent_path = row["Parent"] + file_path = os.path.join(parent_path, row["Filename"]) + total_sizes[file_path] = int(row["Filesize"]) + parents[file_path] = parent_path + + # Step 2: Ensure all directories and their ancestors are in the data structures. + all_paths = set(parents.keys()) + for path in all_paths: + parent = parents.get(path) + while parent and parent not in parents: + if parent not in total_sizes: + total_sizes[parent] = 0 + + grandparent = os.path.dirname(parent) + if grandparent == parent: # Reached the root + parents[parent] = "" + break + parents[parent] = grandparent + parent = grandparent + + # Step 3: Propagate sizes up the tree from the longest paths to the shortest. + for path in sorted(total_sizes.keys(), key=len, reverse=True): + parent = parents.get(path) + if parent is not None and parent in total_sizes: + total_sizes[parent] += total_sizes[path] + + # If only directories are requested, filter out the files. + if only_directories: + # A directory is any path that is a parent of another path. + dir_paths = set(parents.values()) + + # Filter total_sizes and parents to only include directories. + total_sizes = { + path: size for path, size in total_sizes.items() if path in dir_paths + } + parents = { + path: parent for path, parent in parents.items() if path in dir_paths + } + + # Any node whose parent is not in the dataset is a root. + all_paths = set(total_sizes.keys()) + for path, parent in parents.items(): + if parent not in all_paths: + parents[path] = "" + + return total_sizes, parents diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 173d96d..0637749 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -13,7 +13,7 @@ from tqdm import tqdm import pandas as pd import plotly.express as px -from destine_analytics.lsreport import read_filetree +from destine_analytics.lsreport import compute_directory_size, read_filetree from .utils import sanitize_output @@ -360,73 +360,6 @@ def makefigure( return fig -def compute_directory_size(df, only_directories=False): - """ - Compute directory sizes and their hierarchical relationships. - - Parameters: - df (pd.DataFrame): DataFrame with 'Parent' and 'Filename' columns - only_directories (bool): If True, only show directories in the visualization - - Returns: - tuple: (sizes, parents) where: - - sizes: dict mapping paths to their total sizes - - parents: dict mapping paths to their parent directories - """ - if df.empty: - return {}, {} - - total_sizes = {} - parents = {} - - # Step 1: Process all files, recording their size and parent. - for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"): - parent_path = row["Parent"] - file_path = os.path.join(parent_path, row["Filename"]) - total_sizes[file_path] = int(row["Filesize"]) - parents[file_path] = parent_path - - # Step 2: Ensure all directories and their ancestors are in the data structures. - all_paths = set(parents.keys()) - for path in all_paths: - parent = parents.get(path) - while parent and parent not in parents: - if parent not in total_sizes: - total_sizes[parent] = 0 - - grandparent = os.path.dirname(parent) - if grandparent == parent: # Reached the root - parents[parent] = "" - break - parents[parent] = grandparent - parent = grandparent - - # Step 3: Propagate sizes up the tree from the longest paths to the shortest. - for path in sorted(total_sizes.keys(), key=len, reverse=True): - parent = parents.get(path) - if parent is not None and parent in total_sizes: - total_sizes[parent] += total_sizes[path] - - # If only directories are requested, filter out the files. - if only_directories: - # A directory is any path that is a parent of another path. - dir_paths = set(parents.values()) - - # Filter total_sizes and parents to only include directories. - total_sizes = { - path: size for path, size in total_sizes.items() if path in dir_paths - } - parents = { - path: parent for path, parent in parents.items() if path in dir_paths - } - - # Any node whose parent is not in the dataset is a root. - all_paths = set(total_sizes.keys()) - for path, parent in parents.items(): - if parent not in all_paths: - parents[path] = "" - - return total_sizes, parents def _get_dataframe(path_str, ssh_command, expid, cache): -- GitLab From ecb86d89386ac0bb007dc43de9a21f473c6da1e8 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:30:20 +0200 Subject: [PATCH 5/7] Refactor: Move function `compute_directory_size` --- destine_analytics/lsreport.py | 3 ++- destine_analytics/scripts/draw_filetree.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/destine_analytics/lsreport.py b/destine_analytics/lsreport.py index e335203..33a7aa5 100644 --- a/destine_analytics/lsreport.py +++ b/destine_analytics/lsreport.py @@ -7,6 +7,7 @@ import shlex from typing import Union import pandas as pd +from tqdm import tqdm # Regex to match the find -ls output line @@ -127,7 +128,7 @@ def compute_directory_size(df, only_directories=False): parents = {} # Step 1: Process all files, recording their size and parent. - for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"): + for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc="Processing files"): parent_path = row["Parent"] file_path = os.path.join(parent_path, row["Filename"]) total_sizes[file_path] = int(row["Filesize"]) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 0637749..acfd52c 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -9,7 +9,6 @@ import subprocess from io import StringIO from typing import Mapping -from tqdm import tqdm import pandas as pd import plotly.express as px -- GitLab From 9adf449c8cb275b1e1c9a7aaa09c23593b48fab0 Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:38:00 +0200 Subject: [PATCH 6/7] Fix: Arguments of figure type "sunburst" --- destine_analytics/scripts/draw_filetree.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index acfd52c..8de8141 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -268,12 +268,15 @@ def draw( abs_root = root or "unknown" # Update traces for better visibility - fig.update_traces( + update_kwargs = dict( root_color="lightgrey", - textposition="middle center", # Center text in each box textfont=dict(size=14), # Larger text textinfo="label", # Show the label (name) in each box ) + if figure != "sunburst": + # Center text in each box + update_kwargs["textposition"] = "middle center" + fig.update_traces(**update_kwargs) # Update layout for better visibility and larger size title_text = f"{figure.title()} of directory sizes under: {abs_root}" @@ -359,8 +362,6 @@ def makefigure( return fig - - def _get_dataframe(path_str, ssh_command, expid, cache): if ssh_command: find_cmd = f"find {shlex.quote(path_str)} -ls" -- GitLab From 6be169012754d6a8e6a9d14dd25a956fbfd271bd Mon Sep 17 00:00:00 2001 From: Ivan Alsina <1050-ialsina@users.noreply.bsc.es> Date: Mon, 23 Jun 2025 16:39:57 +0200 Subject: [PATCH 7/7] Update: Validate either output or interactive --- destine_analytics/scripts/draw_filetree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py index 8de8141..1d7ba64 100644 --- a/destine_analytics/scripts/draw_filetree.py +++ b/destine_analytics/scripts/draw_filetree.py @@ -416,6 +416,9 @@ def main(): output_dir = args.output df = None + if not args.interactive and not args.output: + raise ValueError("Interactive mode is required when output is not provided.") + cache_key_args = { "path": path_str, "expid": expid, -- GitLab