summaryrefslogtreecommitdiff
path: root/bin/cleaner.py
diff options
context:
space:
mode:
Diffstat (limited to 'bin/cleaner.py')
-rwxr-xr-xbin/cleaner.py62
1 files changed, 62 insertions, 0 deletions
diff --git a/bin/cleaner.py b/bin/cleaner.py
new file mode 100755
index 00000000..d8c999da
--- /dev/null
+++ b/bin/cleaner.py
@@ -0,0 +1,62 @@
1#! venv/bin/python
2import psycopg2
3import psycopg2.extras
4import pathlib
5import os.path
6import datetime
7import pypandoc
8from bs4 import BeautifulSoup
9from datetime import timedelta, datetime, tzinfo
10#from langdetect import detect
11import sys
12import re
13
# Matches any <p> whose id attribute contains "magic" (regex search semantics).
MAGIC_ID = re.compile(r"magic")


def _is_removable(tag):
    """Return True for a tag that has no children or no visible text.

    <br> is always kept.

    NOTE(review): void elements other than <br> (e.g. <img>, <hr>) have no
    contents and are therefore removed as well -- confirm this is intended.
    """
    if tag.name == 'br':
        return False
    return not tag.contents or not tag.get_text(strip=True)


def clean_body(body):
    """Return the cleaned HTML fragment for *body*.

    The markup is parsed with html5lib, tags without content are dropped and
    every attribute is stripped from <p> tags whose id matches MAGIC_ID.

    Returns the serialized children of <body> as one string, or None when
    the parsed document has no <body> left after the cleanup.
    """
    soup = BeautifulSoup(body, 'html5lib')

    # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
    # find_all() returns a materialized result list, so removing tags while
    # walking over it is safe.
    for tag in soup.find_all(_is_removable):
        tag.decompose()

    # Not applied here (kept disabled in earlier revisions of this script):
    # unwrapping <div> tags and rewriting underline/bold <span> styles as
    # <u>/<b>.

    for elm in soup.find_all('p', id=MAGIC_ID):
        # Empty the attribute dict rather than deleting the Tag's `attrs`
        # attribute itself; the serialized output has no attributes either way.
        elm.attrs.clear()

    body_tag = soup.body
    if body_tag is None:
        return None
    return "".join(str(child) for child in body_tag.children)


conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

try:
    cursor.execute("select * from page_translations")
    page_translations = cursor.fetchall()

    # orig.txt collects every body before cleaning, modi.txt every body after
    # cleaning, so the two dumps can be diffed. Explicit UTF-8 avoids
    # depending on the locale's default encoding for non-ASCII page content.
    with open("orig.txt", "w", encoding="utf-8") as fo, \
            open("modi.txt", "w", encoding="utf-8") as fm:
        for translation in page_translations:
            body = translation.get('body')

            # Only bodies that contain a <p id=...> need the fixup.
            if not body or '<p id=' not in body:
                continue

            fo.write(body)

            # page_translations.page_id is the id of the owning page, so no
            # extra lookup in `pages` is needed just to report it.
            print('Needing fixup: {}'.format(translation['page_id']))

            cleaned = clean_body(body)
            if cleaned is None:
                continue

            fm.write(cleaned)
            cursor.execute(
                "update page_translations set body = %s where id = %s",
                [cleaned, translation['id']],
            )
            # Commit per row so finished fixups survive a later failure.
            conn.commit()
finally:
    cursor.close()
    conn.close()