summaryrefslogtreecommitdiff
path: root/bin/cleaner.py
blob: d8c999dafe7bb5d9bfc993066eccaef299af3d26 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#! venv/bin/python
import psycopg2
import psycopg2.extras
import pathlib
import os.path
import datetime
import pypandoc
from bs4 import BeautifulSoup
from datetime import timedelta, datetime, tzinfo
#from langdetect import detect
import sys
import re

# Open a connection to the local "cccms_dev" PostgreSQL database.
conn = psycopg2.connect(
    host="127.0.0.1",
    database="cccms_dev",
    user="postgres",
    password="",
)
# DictCursor yields rows whose columns can be looked up by name
# (e.g. row['body']) instead of only by position.
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

# Load every translation row into memory up front, so the same cursor can be
# reused for further statements while the rows are being processed.
cursor.execute("select * from page_translations")
page_translations = cursor.fetchall()

def _is_removable_empty_tag(tag):
    """Return True for a tag that has no children or no visible text.

    ``<br>`` is exempt so explicit line breaks survive the cleanup.
    NOTE(review): other void elements such as ``<img>`` or ``<hr>`` also have
    no contents and are therefore removed as well -- confirm this is intended.
    """
    is_empty = not tag.contents or len(tag.get_text(strip=True)) <= 0
    return is_empty and tag.name != 'br'


def _clean_body(body):
    """Return the cleaned-up HTML for *body*, or None if nothing is left.

    The markup is parsed with html5lib, empty tags are dropped and every
    attribute is stripped from ``<p>`` elements whose id matches "magic".
    The result is the serialised children of the parsed ``<body>`` element;
    None is returned when no ``<body>`` element remains after the cleanup.
    """
    soup = BeautifulSoup(body, 'html5lib')

    # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
    # find_all() returns a materialised list, so decomposing while looping
    # over it is safe.
    for tag in soup.find_all(_is_removable_empty_tag):
        tag.decompose()

    # Earlier experiments that were left disabled in the original script:
    # unwrapping <div> elements
    # (https://stackoverflow.com/questions/10555932/how-do-i-use-beautifulsoup-to-replace-a-tag-with-its-contents)
    # and turning underline / bold styled <span> elements into <u> / <b>.

    for elm in soup.find_all('p', id=re.compile(r"magic")):
        # Empty the attribute dict in place. Deleting the ``attrs`` attribute
        # itself leaves the tag without its attribute mapping and makes
        # serialisation depend on library-internal fallbacks.
        elm.attrs.clear()

    if soup.body is None:
        return None
    return "".join(str(child) for child in soup.body.children)


try:
    # orig.txt collects the untouched bodies and modi.txt the cleaned ones, so
    # the two files can be compared afterwards. UTF-8 is set explicitly so
    # non-ASCII page content does not depend on the locale's default encoding.
    with open("orig.txt", "w", encoding="utf-8") as fo, \
            open("modi.txt", "w", encoding="utf-8") as fm:
        for translation in page_translations:
            body = translation.get('body')

            # Only bodies that contain a <p id=...> element are candidates.
            if not body or '<p id=' not in body:
                continue

            fo.write(body)

            # page_id is the id of the owning page, so no extra lookup in the
            # pages table is needed just to report it.
            print('Needing fixup: {}'.format(translation['page_id']))

            cleaned = _clean_body(body)
            if cleaned is None:
                continue

            fm.write(cleaned)
            cursor.execute(
                "update page_translations set body = %s where id = %s",
                [cleaned, translation['id']],
            )
            # Commit per row so already-cleaned translations are kept even if
            # a later row fails.
            conn.commit()
finally:
    cursor.close()
    conn.close()