diff options
Diffstat (limited to 'bin/cleaner.py')
| -rwxr-xr-x | bin/cleaner.py | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/bin/cleaner.py b/bin/cleaner.py new file mode 100755 index 00000000..d8c999da --- /dev/null +++ b/bin/cleaner.py | |||
| @@ -0,0 +1,62 @@ | |||
| 1 | #! venv/bin/python | ||
| 2 | import psycopg2 | ||
| 3 | import psycopg2.extras | ||
| 4 | import pathlib | ||
| 5 | import os.path | ||
| 6 | import datetime | ||
| 7 | import pypandoc | ||
| 8 | from bs4 import BeautifulSoup | ||
| 9 | from datetime import timedelta, datetime, tzinfo | ||
| 10 | #from langdetect import detect | ||
| 11 | import sys | ||
| 12 | import re | ||
| 13 | |||
# One-off cleanup script: strips empty tags and the attributes of
# "magic"-id paragraphs from the HTML stored in page_translations.body.
#
# Every original body that gets rewritten is appended to orig.txt and
# every rewritten body to modi.txt so the change can be diffed afterwards.

# Matches the ids of the <p> elements whose attributes are stripped.
MAGIC_ID = re.compile(r"magic")


def needs_fixup(body):
    """Return True when *body* is non-empty and contains a ``<p id=`` tag."""
    return bool(body) and '<p id=' in body


def is_removable(tag):
    """Return True for a tag with no children or no text, except ``<br>``."""
    if tag.name == 'br':
        return False
    return not tag.contents or len(tag.get_text(strip=True)) <= 0


def clean_body(body):
    """Return the cleaned HTML for *body*, or None if no <body> survives.

    The markup is parsed with html5lib, tags without content are removed,
    and every ``<p>`` whose id matches ``magic`` loses all its attributes.
    The result is the serialized children of the parsed ``<body>`` element.
    """
    soup = BeautifulSoup(body, 'html5lib')

    # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
    for tag in soup.find_all(is_removable):
        tag.decompose()

    # NOTE: earlier experiments (unwrapping <div>, turning styled <span>
    # into <u>/<b>) were disabled in the original script and are not applied.

    for elm in soup.find_all('p', id=MAGIC_ID):
        # Reset the attribute dict instead of deleting the ``attrs``
        # attribute itself: the tag must keep a dict to be serialized.
        elm.attrs = {}

    # The empty-tag pass can remove <body> itself when it holds no text.
    if soup.body is None:
        return None
    return "".join(str(child) for child in soup.body.children)


def main():
    """Rewrite every page_translations.body that contains ``<p id=``."""
    conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
    try:
        cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute("select * from page_translations")
        page_translations = cursor.fetchall()

        # Context managers guarantee both dump files are flushed and closed.
        with open("orig.txt", "w") as fo, open("modi.txt", "w") as fm:
            for translation in page_translations:
                body = translation.get('body')
                if not needs_fixup(body):
                    continue

                fo.write(body)

                # page_id is the id of the owning page, so no extra
                # per-row lookup in `pages` is needed to report it.
                print('Needing fixup: {}'.format(translation['page_id']))

                cleaned = clean_body(body)
                if cleaned is None:
                    continue

                fm.write(cleaned)
                cursor.execute(
                    "update page_translations set body = %s where id = %s",
                    [cleaned, translation['id']],
                )

        # Single commit: either every rewritten body is stored or none is.
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()
