-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_docx_to_txt.py
More file actions
89 lines (66 loc) · 2.48 KB
/
convert_docx_to_txt.py
File metadata and controls
89 lines (66 loc) · 2.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""
Convert all .docx files in a directory and its subdirectories to .md (Markdown) files.
Preserves the original directory structure and creates .md files alongside the .docx files.
"""
import os
from pathlib import Path
from docx2md.convert import do_convert
def convert_docx_to_md(docx_path, md_path):
    """
    Convert a single .docx file to .md (Markdown) format.

    Failures are reported on stdout instead of being raised, so a caller that
    processes many documents can carry on after one bad file.

    Args:
        docx_path: Path to the source .docx file (str or path-like)
        md_path: Path where the .md file will be saved (str or path-like);
            an existing file at this location is overwritten

    Returns:
        True if the Markdown file was written, False if converting or
        writing raised an exception.
    """
    # Normalise first: the status messages below rely on ``.name``, which a
    # plain string lacks. Without this a str argument made the error handler
    # itself raise AttributeError instead of returning False.
    docx_path = Path(docx_path)
    md_path = Path(md_path)

    try:
        # Convert .docx to markdown
        markdown_content = do_convert(str(docx_path))
        # Write to .md file
        with open(md_path, 'w', encoding='utf-8') as md_file:
            md_file.write(markdown_content)
    except Exception as e:
        # Intentionally broad: this is the per-file boundary of a batch job,
        # and any converter or filesystem error should only fail this file.
        print(f"✗ Error converting {docx_path.name}: {e}")
        return False
    else:
        print(f"✓ Converted: {docx_path.name}")
        return True
def convert_all_docx_in_directory(root_dir):
    """
    Recursively find and convert all .docx files in a directory.

    Every document found is converted to a .md file with the same base name
    in the same directory. Progress and a final success/failure summary are
    printed to stdout; nothing is returned.

    Args:
        root_dir: Root directory to start searching from
    """
    root_path = Path(root_dir)
    if not root_path.exists():
        print(f"Error: Directory '{root_dir}' does not exist")
        return

    # Find all .docx files recursively.
    # Temporary Word files (starting with ~$) and directories that merely
    # end in ".docx" are filtered out here, before counting, so the "Found"
    # message matches the number of conversions actually attempted.
    # Sorting makes the processing order independent of the filesystem.
    docx_files = sorted(
        candidate
        for candidate in root_path.rglob("*.docx")
        if candidate.is_file() and not candidate.name.startswith('~$')
    )
    if not docx_files:
        print(f"No .docx files found in '{root_dir}'")
        return

    print(f"Found {len(docx_files)} .docx files to convert...\n")

    successful = 0
    failed = 0
    for docx_file in docx_files:
        # Create .md filename in the same directory
        md_file = docx_file.with_suffix('.md')
        # Convert the file
        if convert_docx_to_md(docx_file, md_file):
            successful += 1
        else:
            failed += 1

    print(f"\n{'='*50}")
    print("Conversion complete!")
    print(f"Successfully converted: {successful}")
    print(f"Failed: {failed}")
    print(f"{'='*50}")
if __name__ == "__main__":
    import sys

    # Set the directory to convert.
    # Pass the path of your documentation folder as the first command-line
    # argument, or change the default below. With no argument the script
    # uses the default.
    default_directory = "cisco-academy-documentation"
    directory_to_convert = sys.argv[1] if len(sys.argv) > 1 else default_directory

    print(f"Starting conversion of .docx files to Markdown in '{directory_to_convert}'...")
    print(f"{'='*50}\n")
    convert_all_docx_in_directory(directory_to_convert)