So I fixed the program for title issues, and I could share it, but it seems that if a note-attributes tag exists with content, that each attribute in the content should be procesed with html.escape so that the later unescape doesn't destroy it.
I'm not familiar with ElementTree, so I'm not sure how to write a loop over the note-attributes content.
Another issue is that sometimes there is text before the first div so shouldn't that text be used as the title? If nothing there, then also sometimes the first div is empty... Perhaps it would be nice to check for text before the first div, and then iterate over the divs until finding something non-blank?
# max chars in title
titleLength = 40
import re
import html
import os
import xml.etree.ElementTree as ET
# define a function that takes a string as an argument and strips all html fields
def strip_html_fields(new_title):
# use re.sub to replace any html tags with an empty string
# the pattern is < followed by any characters until >, with the flags re.IGNORECASE and re.DOTALL
# the replacement is an empty string
# the string is new_title
return re.sub("<.*?>", "", new_title, flags=re.IGNORECASE | re.DOTALL)
# Define a function that takes a file name as an argument to process that file
def process_file(file_name):
print("##################")
print(file_name)
# parse input.enex file
tree = ET.parse(file_name)
root = tree.getroot()
# Loop through all the notes in the tree
for note in root.findall("note"):
# Get the title and the content of the note
title = note.find("title").text
content = note.find("content").text
# Check if the title is "Untitled Note"
if title == "Untitled Note":
# Find the first div in the content
start = content.find("<div>")
end = content.find("</div>")
# Extract the text between the div tags
new_title = content[start + 5 : end]
new_title = strip_html_fields(new_title)
new_title = new_title[:titleLength]
new_title = new_title.strip()
else:
new_title = html.escape( title )
if new_title:
# Replace the title with the new title
note.find("title").text = new_title
print(new_title)
note.find("content").text = '<![CDATA[' + content + ']]>'
# write the modified tree to output.enex file
tree.write(file_name)
# open the .enex file in binary mode to convert HTML character codes into text equivalents
with open(file_name, "rb") as f:
# read the file content as bytes
data = f.read()
# decode the bytes using utf-8 encoding
text = data.decode("utf-8")
# unescape the HTML character codes using the html.unescape function
text = html.unescape(text)
# open the .enex file in binary mode to write the output
with open(file_name, "wb") as f:
# encode the text using utf-8 encoding
data = text.encode("utf-8")
# write the data to the file using the file object's write method
f.write(data)
# Get the current directory
current_dir = os.getcwd()
# Loop through all the files in the current directory
for file in os.listdir(current_dir):
# Check if the file has the .enex extension
if file.endswith(".enex"):
# Apply the function to the file name
process_file(file)