Any improvement would be appreciated.
import io
import re
import tokenize
# Matches a PEP 263 encoding declaration such as "# -*- coding: utf-8 -*-".
_CODING_COOKIE = re.compile(r'#.*?coding[:=][ \t]*[-\w.]+')


def _is_interpreter_directive(token):
    """Return True for a shebang or coding-cookie comment that must be kept."""
    row, col = token.start
    if row == 1 and col == 0 and token.string.startswith('#!'):
        return True
    # A coding cookie only counts on line 1 or 2 of a comment-only line.
    return (
        row <= 2
        and not token.line[:col].strip()
        and _CODING_COOKIE.match(token.string) is not None
    )


def _is_plain_string(token):
    """Return True for a string literal token that is not an f-string."""
    if token.type != tokenize.STRING:
        return False
    # Strip the side-effect-free prefixes; an f-string still starts with "f".
    return token.string.lstrip('rRbBuU')[:1] in ('"', "'")


def _block_edits(block, needs_body):
    """Return the edits that drop a block's bare string statements."""
    strings = block['strings']
    edits = [(tok.start, tok.end, '') for tok in strings]
    if needs_body and strings and not block['has_code']:
        # The block held nothing but strings: leave a `pass` behind so the
        # enclosing def/class/if still has a body.
        edits[0] = (strings[0].start, strings[0].end, 'pass')
    return edits


def _collect_edits(source):
    """Tokenize *source* and return ``(edits, protected_rows)``.

    ``edits`` is a list of ``(start, end, replacement)`` spans, where start
    and end are tokenize ``(row, col)`` pairs. ``protected_rows`` holds the
    row numbers lying inside a multi-line token that is kept, so blank lines
    there are string content rather than removable whitespace.
    """
    edits = []
    protected_rows = set()
    blocks = [{'has_code': False, 'strings': []}]  # innermost block is last
    statement = []  # significant tokens of the current logical line

    for token in tokenize.generate_tokens(io.StringIO(source).readline):
        kind = token.type
        if kind == tokenize.COMMENT:
            if not _is_interpreter_directive(token):
                edits.append((token.start, token.end, ''))
        elif kind == tokenize.NL:
            continue
        elif kind == tokenize.INDENT:
            blocks.append({'has_code': False, 'strings': []})
        elif kind == tokenize.DEDENT:
            edits.extend(_block_edits(blocks.pop(), needs_body=True))
        elif kind in (tokenize.NEWLINE, tokenize.ENDMARKER):
            if not statement:
                continue
            if len(statement) == 1 and _is_plain_string(statement[0]):
                # A statement made of a single string literal is a docstring
                # (or a string used as a block comment).
                blocks[-1]['strings'].append(statement[0])
            else:
                blocks[-1]['has_code'] = True
                for tok in statement:
                    protected_rows.update(range(tok.start[0] + 1, tok.end[0]))
            statement = []
        else:
            statement.append(token)

    # An empty module is valid, so the top level never needs a `pass`.
    edits.extend(_block_edits(blocks[0], needs_body=False))
    return edits, protected_rows


def _strip_comments(source):
    """Return *source* without comments, docstrings and blank lines."""
    edits, protected_rows = _collect_edits(source)
    # Split on '\n' only, which is how the tokenizer numbered the rows.
    lines = source.split('\n')

    # Apply from the end of the file backwards so earlier spans stay valid.
    for (srow, scol), (erow, ecol), replacement in sorted(edits, reverse=True):
        head = lines[srow - 1][:scol]
        tail = lines[erow - 1][ecol:]
        # Safe to rstrip: a removed span is always the last thing on its line.
        lines[srow - 1] = (head + replacement + tail).rstrip()
        for index in range(srow, erow):
            lines[index] = None  # rows swallowed by a multi-line docstring

    kept = [
        line
        for row, line in enumerate(lines, start=1)
        if line is not None and (line.strip() or row in protected_rows)
    ]
    return ('\n'.join(kept) + '\n') if kept else ''


def remove_comments(file_path):
    """Strip comments, docstrings and blank lines from a Python file in place.

    The file is tokenized rather than pattern-matched, so ``#`` characters and
    triple-quoted text inside real string values are left alone. Removed are:

    * every ``#`` comment, except a shebang on line 1 and a PEP 263 coding
      declaration on line 1 or 2;
    * every statement that consists solely of a (non-f) string literal, i.e.
      docstrings and strings used as block comments. When that would empty a
      block, a ``pass`` is left in its place;
    * blank lines that are not part of a multi-line string.

    Args:
        file_path: Path of the Python source file to rewrite.

    Returns:
        None. The file at ``file_path`` is overwritten with the result.

    Raises:
        OSError: If the file cannot be read or written.
        SyntaxError, tokenize.TokenError: If the source cannot be tokenized.
            The file has not been modified when this happens.
    """
    # tokenize.open decodes using the file's own BOM / coding declaration.
    with tokenize.open(file_path) as file:
        encoding = file.encoding
        content = file.read()

    # Finish all processing before reopening for write, so a failure above
    # never truncates the file.
    cleaned = _strip_comments(content)

    with open(file_path, 'w', encoding=encoding) as file:
        file.write(cleaned)
# Example usage. Guarded so that merely importing this module never rewrites
# a file; the call only runs when the file is executed as a script.
if __name__ == '__main__':
    remove_comments('your_script.py')