diff --git a/destine_analytics/lsreport.py b/destine_analytics/lsreport.py
index 6ca20d9c369322612312da6312cd4884542424a7..33a7aa50f3a977a94c0e23e369a5204a4a40c3fd 100644
--- a/destine_analytics/lsreport.py
+++ b/destine_analytics/lsreport.py
@@ -7,6 +7,7 @@ import shlex
 from typing import Union
 
 import pandas as pd
+from tqdm import tqdm
 
 
 # Regex to match the find -ls output line
@@ -105,3 +106,72 @@ def read_filetree(file: Union[str, Path, StringIO], common: bool = True) -> pd.D
             lambda x: x.removeprefix(value).removeprefix("/")
         )
     return df
+
+
+def compute_directory_size(df, only_directories=False):
+    """
+    Compute directory sizes and their hierarchical relationships.
+
+    Parameters:
+        df (pd.DataFrame): DataFrame with 'Parent' and 'Filename' columns
+        only_directories (bool): If True, only show directories in the visualization
+
+    Returns:
+        tuple: (sizes, parents) where:
+            - sizes: dict mapping paths to their total sizes
+            - parents: dict mapping paths to their parent directories
+    """
+    if df.empty:
+        return {}, {}
+
+    total_sizes = {}
+    parents = {}
+
+    # Step 1: Process all files, recording their size and parent.
+    for _, row in tqdm(df.iterrows(), total=len(df), leave=False, desc="Processing files"):
+        parent_path = row["Parent"]
+        file_path = os.path.join(parent_path, row["Filename"])
+        total_sizes[file_path] = int(row["Filesize"])
+        parents[file_path] = parent_path
+
+    # Step 2: Ensure all directories and their ancestors are in the data structures.
+    all_paths = set(parents.keys())
+    for path in all_paths:
+        parent = parents.get(path)
+        while parent and parent not in parents:
+            if parent not in total_sizes:
+                total_sizes[parent] = 0
+
+            grandparent = os.path.dirname(parent)
+            if grandparent == parent:  # Reached the root
+                parents[parent] = ""
+                break
+            parents[parent] = grandparent
+            parent = grandparent
+
+    # Step 3: Propagate sizes up the tree from the longest paths to the shortest.
+    for path in sorted(total_sizes.keys(), key=len, reverse=True):
+        parent = parents.get(path)
+        if parent is not None and parent in total_sizes:
+            total_sizes[parent] += total_sizes[path]
+
+    # If only directories are requested, filter out the files.
+    if only_directories:
+        # A directory is any path that is a parent of another path.
+        dir_paths = set(parents.values())
+
+        # Filter total_sizes and parents to only include directories.
+        total_sizes = {
+            path: size for path, size in total_sizes.items() if path in dir_paths
+        }
+        parents = {
+            path: parent for path, parent in parents.items() if path in dir_paths
+        }
+
+    # Any node whose parent is not in the dataset is a root.
+    all_paths = set(total_sizes.keys())
+    for path, parent in parents.items():
+        if parent not in all_paths:
+            parents[path] = ""
+
+    return total_sizes, parents
diff --git a/destine_analytics/scripts/draw_filetree.py b/destine_analytics/scripts/draw_filetree.py
index 350b273f1fd5b0f1178350916ceec072bf1b3d17..1d7ba64320d00c56b5662646e3ada898822b9432 100644
--- a/destine_analytics/scripts/draw_filetree.py
+++ b/destine_analytics/scripts/draw_filetree.py
@@ -7,16 +7,14 @@ import os
 import shlex
 import subprocess
 from io import StringIO
-from tqdm import tqdm
-import pandas as pd
 from typing import Mapping
+
+import pandas as pd
 import plotly.express as px
 
-from destine_analytics.lsreport import read_filetree
+from destine_analytics.lsreport import compute_directory_size, read_filetree
 from .utils import sanitize_output
 
-df = None
-
 
 def _get_parser():
     parser = ArgumentParser(
@@ -50,8 +48,8 @@
         "-r",
         "--root",
         type=str,
-        default="",
-        help="Root directory to display. Must be within the file tree examined.",
+        default=None,
+        help="Root directory to display. Defaults to the main path.",
     )
     parser.add_argument(
         "-d",
@@ -132,8 +130,8 @@ def _cache_write(cache, obj, name, /, *args, **kwargs):
 def _add_path_column(df):
     # For create_filetree.sh output, Parent already contains the full directory path
     # We don't need to normalize it further, just create the full file paths
-    df["Path"] = (df["Parent"] + "/" + df["Filename"]).apply(
-        lambda x: x.removeprefix("/")
+    df["Path"] = df.apply(
+        lambda row: os.path.join(row["Parent"], row["Filename"]), axis=1
     )
     return df
 
@@ -156,7 +154,12 @@
     abs_root = os.path.normpath(os.path.join(common_root, root))
 
     # Filter to subtree - only include files that are under the absolute root
-    mask = df["Parent"].str.startswith(abs_root)
+    if abs_root == "/":
+        mask = df["Parent"].str.startswith("/")
+    else:
+        mask = (df["Parent"] == abs_root) | (
+            df["Parent"].str.startswith(abs_root + "/")
+        )
     subtree_df = df[mask].copy()
 
     if subtree_df.empty:
@@ -202,6 +205,7 @@
        root (str | None): Subdirectory to start visualization from. Can be absolute or relative to the common root.
        figure (str): One of 'treemap', 'sunburst', or 'icicle'.
    """
+
    if figure not in {"treemap", "sunburst", "icicle"}:
        raise ValueError(
            f"Unsupported figure '{figure}'. Use 'treemap', 'sunburst', or 'icicle'."
@@ -225,13 +229,22 @@
     # Add human readable size for hover
     viz_df["size_str"] = viz_df["value"].apply(_human_readable_size)
 
-    # Filter by root if specified
+    # If a root is specified, filter the data to start from there.
     if root:
-        mask = viz_df["id"].str.startswith(root)
+        # This is safer than `startswith` which can match `/path/to/dir-other`
+        if root == "/":
+            mask = viz_df["id"].str.startswith("/")
+        else:
+            mask = (viz_df["id"] == root) | viz_df["id"].str.startswith(root + "/")
         viz_df = viz_df[mask]
+
         if viz_df.empty:
             raise ValueError(f"No entries found under root '{root}'.")
 
+    # Any node whose parent is not in the dataset is a root.
+    all_ids = set(viz_df["id"])
+    viz_df.loc[~viz_df["parent"].isin(all_ids), "parent"] = ""
+
     # Create the figure
     fig = getattr(px, figure)(
         viz_df,
@@ -246,27 +259,24 @@
     )
 
     # Get the root for the title
-    if root:
-        abs_root = root
+    root_entries = viz_df[viz_df["parent"] == ""]
+    if not root_entries.empty:
+        abs_root = root_entries["id"].iloc[0]
+    elif not viz_df.empty:
+        abs_root = viz_df["id"].iloc[0]
     else:
-        # Try to find the root element (one with empty parent)
-        root_entries = viz_df[viz_df["parent"] == ""]
-        if not root_entries.empty:
-            abs_root = root_entries["id"].iloc[0]
-        else:
-            # Fallback: use the first entry or a default
-            if not viz_df.empty:
-                abs_root = viz_df["id"].iloc[0]
-            else:
-                abs_root = "unknown"
+        abs_root = root or "unknown"
 
     # Update traces for better visibility
-    fig.update_traces(
+    update_kwargs = dict(
         root_color="lightgrey",
-        textposition="middle center",  # Center text in each box
         textfont=dict(size=14),  # Larger text
         textinfo="label",  # Show the label (name) in each box
     )
+    if figure != "sunburst":
+        # Center text in each box
+        update_kwargs["textposition"] = "middle center"
+    fig.update_traces(**update_kwargs)
 
     # Update layout for better visibility and larger size
     title_text = f"{figure.title()} of directory sizes under: {abs_root}"
@@ -352,74 +362,6 @@
     return fig
 
 
-def compute_directory_size(df, only_directories=False):
-    """
-    Compute directory sizes and their hierarchical relationships.
-
-    Parameters:
-        df (pd.DataFrame): DataFrame with 'Parent' and 'Filename' columns
-        only_directories (bool): If True, only show directories in the visualization
-
-    Returns:
-        tuple: (sizes, parents) where:
-            - sizes: dict mapping paths to their total sizes
-            - parents: dict mapping paths to their parent directories
-    """
-    if df.empty:
-        return {}, {}
-
-    # Initialize dictionaries
-    direct_sizes = {}  # Size of files directly in each directory
-    total_sizes = {}  # Total size including subdirectories
-    parents = {}
-
-    # First pass: compute direct sizes and parent relationships
-    # Start with directories
-    for directory in tqdm(sorted(df["Parent"].unique())):
-        # Get files directly in this directory
-        mask = df["Parent"] == directory
-        direct_sizes[directory] = int(df.loc[mask, "Filesize"].sum())
-        total_sizes[directory] = direct_sizes[directory]  # Initialize total size
-
-        # Set parent relationship based on actual directory structure
-        if directory == "/" or directory == "":
-            parents[directory] = ""
-        else:
-            # Find the parent directory by going up one level
-            parent_dir = os.path.dirname(directory)
-            if parent_dir == directory:  # Root directory
-                parents[directory] = ""
-            else:
-                parents[directory] = parent_dir
-
-    # Add files if not only_directories
-    if not only_directories:
-        for _, row in tqdm(df.iterrows(), total=len(df)):
-            file_path = f"{row['Parent']}/{row['Filename']}"
-            if (
-                file_path not in direct_sizes
-            ):  # Only add if not already added as a directory
-                direct_sizes[file_path] = int(row["Filesize"])
-                total_sizes[file_path] = direct_sizes[file_path]
-                parents[file_path] = row["Parent"]
-
-    # Second pass: propagate sizes up the tree
-    # Process paths from longest to shortest
-    for path in sorted(direct_sizes.keys(), key=len, reverse=True):
-        parent = parents[path]
-        if parent and parent in total_sizes:  # If not root and parent exists
-            total_sizes[parent] += total_sizes[path]
-
-    # Ensure we have at least one root element
-    if not any(parent == "" for parent in parents.values()):
-        # Find the shortest path as root
-        if direct_sizes:
-            shortest_path = min(direct_sizes.keys(), key=len)
-            parents[shortest_path] = ""
-
-    return total_sizes, parents
-
-
 def _get_dataframe(path_str, ssh_command, expid, cache):
     if ssh_command:
         find_cmd = f"find {shlex.quote(path_str)} -ls"
@@ -461,12 +403,12 @@
 
 
 def main():
-    global df
+
     args = _get_parser().parse_args()
-    path_str = args.path
+    path_str = os.path.normpath(args.path)
     ssh_command = args.ssh_command
     expid = args.expid
-    root = args.root
+    root = os.path.normpath(args.root) if args.root is not None else path_str
     only_directories = args.only_directories
     figure = args.figure
     interactive = args.interactive
@@ -474,6 +416,9 @@
     output_dir = args.output
     df = None
 
+    if not args.interactive and not args.output:
+        raise ValueError("Interactive mode is required when output is not provided.")
+
    cache_key_args = {
        "path": path_str,
        "expid": expid,
@@ -484,6 +429,7 @@
         df = _get_dataframe(path_str, ssh_command, expid, cache)
         _cache_write(cache, df, "df", **cache_key_args)
 
+    df["Parent"] = df["Parent"].apply(os.path.normpath)
     df = _add_path_column(df)
     df = _get_subtree_df(df, root)
 
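For reviewers, a minimal sketch of how the relocated `compute_directory_size` can be exercised on its own. The toy DataFrame and the `px.treemap` column mapping are illustrative assumptions only; `draw()` builds its own `viz_df` and figure call, which are not reproduced here.

```python
# Minimal sketch, assuming the post-patch layout where compute_directory_size
# lives in destine_analytics.lsreport. The sample data below is hypothetical.
import pandas as pd
import plotly.express as px

from destine_analytics.lsreport import compute_directory_size

df = pd.DataFrame(
    {
        "Parent": ["/data", "/data", "/data/sub"],
        "Filename": ["a.nc", "b.nc", "c.nc"],
        "Filesize": [100, 200, 50],
    }
)

# sizes maps every file and reconstructed ancestor directory to its total size;
# parents maps each path to its parent, with "" marking a root node.
sizes, parents = compute_directory_size(df)

viz_df = pd.DataFrame(
    {
        "id": list(sizes),
        "parent": [parents[path] for path in sizes],
        "value": [sizes[path] for path in sizes],
    }
)

fig = px.treemap(viz_df, ids="id", names="id", parents="parent", values="value")
# Parent values already include their children, so treat them as branch totals.
fig.update_traces(branchvalues="total")
fig.show()
```

Because the sizes returned are cumulative, the `branchvalues="total"` trace setting keeps the treemap areas from double-counting children; the same consideration applies to the sunburst and icicle variants.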