import csv
import json
import mimetypes
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Set

from tqdm import tqdm

def extract_wikilinks(content: str) -> Set[str]:
    """Extract all [[wikilinks]] from content.

    Args:
        content (str): File content to analyze

    Returns:
        Set[str]: Set of unique wikilink targets
    """
    # Match [[link]] or [[link|alias]] patterns, capturing only the
    # link target and discarding any |alias portion
    pattern = r'\[\[(.*?)(?:\|.*?)?\]\]'
    matches = re.findall(pattern, content)
    return set(matches)
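
# Illustrative check of the pattern above (input string is hypothetical):
#   extract_wikilinks("See [[Alpha]] and [[Beta|the beta page]].")
#   -> {'Alpha', 'Beta'}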

def get_file_info(file_path: Path) -> Dict:
    """Get metadata and content info for a file.

    Args:
        file_path (Path): Path to the file

    Returns:
        Dict: File metadata including size, type, and wikilinks
    """
    stat = file_path.stat()  # single stat() call reused below
    info = {
        'name': file_path.name,
        'extension': file_path.suffix,
        'size': stat.st_size,
        'mime_type': mimetypes.guess_type(file_path)[0],
        # Stores the path as given; list_files_in_directory() overwrites
        # this with the path relative to the scanned root
        'relative_path': str(file_path),
        'modified_time': datetime.fromtimestamp(stat.st_mtime).isoformat(),
        'wikilinks': set()
    }

    # Extract wikilinks from text files
    if file_path.suffix in ['.md', '.txt']:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                info['wikilinks'] = extract_wikilinks(content)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    return info
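
# Shape of the returned dict (values illustrative, for a hypothetical notes/a.md):
#   {'name': 'a.md', 'extension': '.md', 'size': 1024,
#    'mime_type': 'text/markdown', 'relative_path': 'notes/a.md',
#    'modified_time': '2025-01-01T00:00:00', 'wikilinks': {'Beta'}}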

def list_files_in_directory(directory_path: str) -> Dict:
    """Recursively collect metadata for every file under the given directory.

    This function traverses a directory tree and records, for each file:
    - File name and extension
    - File size in bytes
    - File type/MIME type
    - Path relative to the root directory
    - Any Obsidian-style [[wikilinks]] found in the file content

    Args:
        directory_path (str): Path to the directory to analyze

    Returns:
        dict: Dictionary containing the file structure and metadata
    """
    root_path = Path(directory_path)

    print("\n📂 Scanning directory structure...")

    # Initialize structure
    structure = {
        'root_path': str(root_path),
        'files': {},
        'directories': {},
        'wikilink_graph': {},
        'stats': {
            'total_files': 0,
            'total_size': 0,
            'file_types': {},
            'wikilinks': set()
        }
    }

    # Get total file count for the progress bar
    total_files = sum(1 for p in root_path.rglob('*') if p.is_file())

    # Recursively process the directory
    with tqdm(total=total_files, desc="Processing files", unit="file") as pbar:
        for path in root_path.rglob('*'):
            if path.is_file():
                # Get path relative to the root
                rel_path = path.relative_to(root_path)
                parent_dir = str(rel_path.parent)

                # Get file info and normalize its stored path to the root
                file_info = get_file_info(path)
                file_info['relative_path'] = str(rel_path)

                # Update directory listing
                if parent_dir not in structure['directories']:
                    structure['directories'][parent_dir] = []
                structure['directories'][parent_dir].append(file_info['name'])

                # Store file info
                structure['files'][str(rel_path)] = file_info

                # Update stats
                structure['stats']['total_files'] += 1
                structure['stats']['total_size'] += file_info['size']
                ext = file_info['extension']
                structure['stats']['file_types'][ext] = structure['stats']['file_types'].get(ext, 0) + 1
                structure['stats']['wikilinks'].update(file_info['wikilinks'])

                # Update wikilink graph
                if file_info['wikilinks']:
                    structure['wikilink_graph'][str(rel_path)] = list(file_info['wikilinks'])

                pbar.update(1)

    return structure
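
# Illustrative usage (path is hypothetical):
#   structure = list_files_in_directory('/path/to/vault')
#   structure['wikilink_graph']  -> {'notes/a.md': ['Beta'], ...}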

def export_json(structure: Dict, output_dir: Path) -> None:
    """Export the directory structure as JSON.

    Args:
        structure (Dict): Directory structure
        output_dir (Path): Output directory path
    """
    # Convert sets to lists for JSON serialization
    json_structure = {
        'root_path': structure['root_path'],
        'files': {
            k: {
                **v,
                'wikilinks': list(v['wikilinks'])
            } for k, v in structure['files'].items()
        },
        'directories': structure['directories'],
        'wikilink_graph': structure['wikilink_graph'],
        'stats': {
            **structure['stats'],
            'wikilinks': list(structure['stats']['wikilinks'])
        }
    }

    output_file = output_dir / 'file_structure.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(json_structure, f, indent=2)
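
# Abridged sketch of the resulting file_structure.json (values illustrative):
# {
#   "root_path": "/path/to/vault",
#   "files": {"notes/a.md": {"name": "a.md", "size": 1024, "wikilinks": ["Beta"], ...}},
#   "wikilink_graph": {"notes/a.md": ["Beta"]},
#   "stats": {"total_files": 1, "total_size": 1024, "file_types": {".md": 1}, "wikilinks": ["Beta"]}
# }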

def export_csv(structure: Dict, output_dir: Path) -> None:
    """Export file information as CSV.

    Args:
        structure (Dict): Directory structure
        output_dir (Path): Output directory path
    """
    # Export files data
    files_output = output_dir / 'files.csv'
    with open(files_output, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['path', 'name', 'extension', 'size', 'mime_type', 'modified_time', 'wikilinks'])
        for path, info in structure['files'].items():
            writer.writerow([
                path,
                info['name'],
                info['extension'],
                info['size'],
                info['mime_type'],
                info['modified_time'],
                '|'.join(info['wikilinks'])
            ])

    # Export wikilink graph as source/target edge pairs
    links_output = output_dir / 'wikilinks.csv'
    with open(links_output, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['source', 'target'])
        for source, targets in structure['wikilink_graph'].items():
            for target in targets:
                writer.writerow([source, target])
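
# files.csv layout (one row per file; multiple wikilinks are '|'-joined), illustrative row:
#   notes/a.md,a.md,.md,1024,text/markdown,2025-01-01T00:00:00,Beta|Gamma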

def export_txt(structure: Dict, output_dir: Path) -> None:
    """Export directory structure as formatted text.

    Args:
        structure (Dict): Directory structure
        output_dir (Path): Output directory path
    """
    output_file = output_dir / 'file_structure.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        # Write header
        f.write("=== Directory Structure Analysis ===\n\n")

        # Write basic stats
        f.write(f"Root path: {structure['root_path']}\n")
        f.write(f"Total files: {structure['stats']['total_files']}\n")
        f.write(f"Total size: {structure['stats']['total_size']} bytes\n\n")

        # Write file types
        f.write("File types:\n")
        for ext, count in structure['stats']['file_types'].items():
            f.write(f"  {ext}: {count} files\n")

        # Write directory structure
        f.write("\nDirectory structure:\n")
        for dir_path, files in structure['directories'].items():
            f.write(f"\n{dir_path}/\n")
            for file in files:
                f.write(f"  - {file}\n")

        # Write wikilink relationships
        f.write("\nWikilink relationships:\n")
        for source, targets in structure['wikilink_graph'].items():
            if targets:
                f.write(f"\n{source} links to:\n")
                for target in targets:
                    f.write(f"  - [[{target}]]\n")
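
# file_structure.txt mirrors the console report from print_structure() below:
# stats, file types, per-directory listings, then "X links to:" blocks.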

def export_structure(structure: Dict, output_dir: Path) -> None:
    """Export directory structure in multiple formats.

    Args:
        structure (Dict): Directory structure
        output_dir (Path): Output directory path
    """
    print("\n💾 Exporting directory structure...")

    # Create output directory if it doesn't exist
    output_dir.mkdir(parents=True, exist_ok=True)

    # Export in different formats
    with tqdm(total=3, desc="Generating outputs", unit="file") as pbar:
        export_json(structure, output_dir)
        pbar.update(1)

        export_csv(structure, output_dir)
        pbar.update(1)

        export_txt(structure, output_dir)
        pbar.update(1)

    print(f"\n✨ Exported directory structure to {output_dir}:")
    print("  📄 JSON: file_structure.json")
    print("  📊 CSV: files.csv, wikilinks.csv")
    print("  📝 Text: file_structure.txt")

def print_structure(structure: Dict) -> None:
    """Print a human-readable summary of the directory structure.

    Args:
        structure (Dict): Directory structure from list_files_in_directory
    """
    print("\n📊 Directory Structure Analysis")
    print("=" * 40 + "\n")

    # Print basic stats
    print(f"📂 Root path: {structure['root_path']}")
    print(f"📁 Total files: {structure['stats']['total_files']}")
    print(f"💾 Total size: {structure['stats']['total_size']:,} bytes")

    # Print file types
    print("\n📋 File types:")
    for ext, count in structure['stats']['file_types'].items():
        print(f"  {ext or '(no extension)'}: {count} files")

    # Print directory structure
    print("\n🌳 Directory structure:")
    for dir_path, files in structure['directories'].items():
        print(f"\n{dir_path}/")
        for file in files:
            print(f"  - {file}")

    # Print wikilink relationships
    print("\n🔗 Wikilink relationships:")
    for source, targets in structure['wikilink_graph'].items():
        if targets:
            print(f"\n{source} links to:")
            for target in targets:
                print(f"  - [[{target}]]")

    print("\n" + "=" * 40)

def main():
    """Main function to run the directory analysis."""
    start_time = time.time()

    # Get the script's directory
    script_dir = Path(__file__).parent

    # Analyze the parent directory (knowledge base root)
    kb_dir = script_dir.parent

    print("\n🔍 Knowledge Base Directory Analysis")
    print("=" * 40)

    print(f"\n📂 Analyzing directory: {kb_dir}")
    structure = list_files_in_directory(str(kb_dir))

    # Create output directory
    output_dir = script_dir / 'output'
    export_structure(structure, output_dir)

    # Also print to console
    print_structure(structure)

    end_time = time.time()
    duration = end_time - start_time
    print(f"\n✨ Analysis completed in {duration:.2f}s\n")


if __name__ == "__main__":
    main()
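
# To run this script directly (filename hypothetical):
#   python analyze_kb.py
# It scans the script's parent directory and writes the three reports
# to an output/ folder next to the script.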