1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
#! venv/bin/python
import psycopg2
import psycopg2.extras
import pathlib
import os.path
import datetime
import pypandoc
from bs4 import BeautifulSoup
from datetime import timedelta, datetime, tzinfo
#from langdetect import detect
import sys
import re
# Matches the id attribute of the <p> elements whose attributes get stripped.
MAGIC_ID_RE = re.compile(r"magic")


def _is_empty_tag(tag):
    """Return True for a non-<br> tag with no children or no text content."""
    # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
    # NOTE(review): this also matches text-less elements such as <img> or
    # <hr>, so they are removed as well -- confirm that this is intended.
    has_no_text = not tag.contents or len(tag.get_text(strip=True)) <= 0
    return has_no_text and tag.name != 'br'


def _clean_body(body):
    """Return the cleaned-up HTML for *body*, or None if nothing is left.

    The markup is parsed with html5lib, every empty tag (see
    ``_is_empty_tag``) is removed, and all attributes are dropped from
    <p> elements whose id matches ``MAGIC_ID_RE``.  The result is the
    serialised content of the parsed <body> element; None is returned
    when the <body> element itself was removed as empty.
    """
    soup = BeautifulSoup(body, 'html5lib')

    # find_all() returns a list, so removing tags while looping is safe.
    for empty_tag in soup.find_all(_is_empty_tag):
        empty_tag.decompose()

    for paragraph in soup.find_all('p', id=MAGIC_ID_RE):
        # Assign an empty dict instead of `del paragraph.attrs`: deleting the
        # instance attribute makes later `.attrs` reads fall through to
        # Tag.__getattr__, which breaks serialisation on some bs4 versions.
        paragraph.attrs = {}

    if soup.body is None:
        return None
    return "".join([str(child) for child in soup.body.children])


conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
try:
    # orig.txt / modi.txt collect the bodies before and after the clean-up so
    # the changes can be reviewed; written as UTF-8 regardless of the locale.
    with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor, \
            open("orig.txt", "w", encoding="utf-8") as fo, \
            open("modi.txt", "w", encoding="utf-8") as fm:
        cursor.execute("select * from page_translations")
        page_translations = cursor.fetchall()

        for translation in page_translations:
            body = translation.get('body')
            # Only bodies containing a <p> with an id attribute need fixing.
            if not body or '<p id=' not in body:
                continue

            fo.write(body)
            # The page id is already on the translation row, so no extra
            # lookup in the pages table is needed just to report it.
            print('Needing fixup: {}'.format(translation['page_id']))

            cleaned = _clean_body(body)
            if cleaned is None:
                continue

            fm.write(cleaned)
            cursor.execute(
                "update page_translations set body = %s where id = %s",
                [cleaned, translation['id']],
            )
            # Commit per row so finished updates survive a later failure.
            conn.commit()
finally:
    conn.close()
|