summaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
Diffstat (limited to 'bin')
-rw-r--r-- bin/Makefile 18
-rwxr-xr-x bin/cleaner.py 62
-rwxr-xr-x bin/export.py 158
3 files changed, 238 insertions, 0 deletions
diff --git a/bin/Makefile b/bin/Makefile
new file mode 100644
index 00000000..10ac0d1e
--- /dev/null
+++ b/bin/Makefile
@@ -0,0 +1,18 @@
# Build pipeline for the static site: export the CMS content into git/,
# lay a Nikola site over it, then render it.
#
# None of these targets produce a file named after themselves, so they are
# declared phony; otherwise an existing file or directory called `build`,
# `nikola`, ... would make the target look up to date and it would be skipped.
.PHONY: all rebuild nikola build

all: rebuild nikola build

# Wipe earlier output, re-run the export, and remove the exported
# pages/updates.md.
rebuild:
	rm -rf git dump
	./export.py
	rm git/pages/updates.md

# Initialise a Nikola site inside git/ and copy configuration, theme, data,
# shortcodes and the hand-maintained pages into it.
# `cd git && ...` (rather than `cd git; ...`) stops the recipe if git/ is
# missing instead of running nikola in the wrong directory.
nikola:
	venv/bin/nikola init -q git
	cp site/conf.py git
	cd git && nikola plugin -i localsearch
	cd git && nikola theme -n ccc --parent=bootstrap4
	cp -r site/themes/ccc git/themes/
	cp -r site/data site/shortcodes git/
	cp -r site/index.rst site/index.en.rst site/search.html site/cpu site/themen git/pages/

# Render the site.
build:
	cd git && nikola build --backend=sqlite3
diff --git a/bin/cleaner.py b/bin/cleaner.py
new file mode 100755
index 00000000..d8c999da
--- /dev/null
+++ b/bin/cleaner.py
@@ -0,0 +1,62 @@
1#! venv/bin/python
2import psycopg2
3import psycopg2.extras
4import pathlib
5import os.path
6import datetime
7import pypandoc
8from bs4 import BeautifulSoup
9from datetime import timedelta, datetime, tzinfo
10#from langdetect import detect
11import sys
12import re
13
# Clean up the HTML of every page translation whose body contains a
# `<p id=` marker: drop elements without text content and strip all
# attributes from paragraphs whose id matches "magic".  The cleaned HTML is
# written back to page_translations; the untouched and the cleaned bodies are
# additionally appended to orig.txt / modi.txt so the effect can be diffed.
conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

cursor.execute("select * from page_translations")
page_translations = cursor.fetchall()

magic_id = re.compile(r"magic")


def _is_empty_element(tag):
    """Return True for any tag except <br> that has no children or no text."""
    if tag.name == 'br':
        return False
    return not tag.contents or len(tag.get_text(strip=True)) <= 0


# Context managers guarantee both log files are flushed and closed, even if
# parsing or the database update raises part-way through.
with open("orig.txt", "w") as fo, open("modi.txt", "w") as fm:
    for translation in page_translations:
        body = translation.get('body')

        # Only bodies carrying the `<p id=` marker need fixing.
        if not body or '<p id=' not in body:
            continue

        fo.write(body)

        # The translation row already carries the page id, so no extra
        # query against `pages` is needed just to report it.
        print('Needing fixup: {}'.format(translation['page_id']))
        soup = BeautifulSoup(body, 'html5lib')

        # https://stackoverflow.com/questions/33500888/how-to-remove-tags-that-have-no-content
        # find_all() returns a materialised result list, so removing
        # elements while walking it is safe.
        for empty in soup.find_all(_is_empty_element):
            empty.decompose()

        for elm in soup.find_all('p', id=magic_id):
            # Reset to an empty dict instead of deleting the attribute, so
            # serialising the tag below always finds a mapping.
            elm.attrs = {}

        if soup.body:
            # Serialise once and reuse for both the log and the update.
            cleaned = "".join(str(child) for child in soup.body.children)
            fm.write(cleaned)
            cursor.execute("update page_translations set body = %s where id = %s", [cleaned, translation['id']])

# One commit for the whole run: either every translation is fixed or none.
conn.commit()
diff --git a/bin/export.py b/bin/export.py
new file mode 100755
index 00000000..26460e8f
--- /dev/null
+++ b/bin/export.py
@@ -0,0 +1,158 @@
1#! venv/bin/python
2import psycopg2
3import psycopg2.extras
4import pathlib
5import os.path
6import datetime
7import pypandoc
8import json
9from datetime import timedelta, datetime, tzinfo
10#from langdetect import detect
11import sys
12
# Database handles: `cursor` yields rows addressable by column name,
# `lcursor` is a plain cursor on the same connection.
conn = psycopg2.connect(database="cccms_dev", user="postgres", password="", host="127.0.0.1")
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
lcursor = conn.cursor()

# All users, fetched once up front.
cursor.execute("select id, login, email, admin from users", [])
user = cursor.fetchall()

# Accumulators filled by the export steps that follow.
page_list = []
redirect_list = {}

# The repository directory has to exist before `git init` runs in it.
pathlib.Path(os.path.dirname('git/')).mkdir(parents=True, exist_ok=True)
os.system("git -C git/ init")

# Remaining output directories for the repository and the raw dump.
for directory in ('git/images/', 'git/files/', 'dump/images/', 'dump/files/'):
    pathlib.Path(os.path.dirname(directory)).mkdir(parents=True, exist_ok=True)

# Queue every uploaded asset for the git import and record a redirect from
# its old upload URL to its new location (images/ or files/).  Assets get
# consecutive revision numbers in id order.
rev = 0
cursor.execute("select id, name, upload_content_type, upload_file_name, upload_updated_at from assets order by id", [])
assets = cursor.fetchall()
for asset in assets:
    file_name = asset.get('upload_file_name')
    if not file_name:
        # Asset row without an uploaded file: nothing to copy.
        continue

    # URL path of the upload; the file on disk lives under the same path
    # below the CMS installation directory.
    upload_path = '/public/system/uploads/{}/original/{}'.format(asset['id'], file_name)
    source = "/usr/local/www/cccms" + upload_path

    # The content type column may be NULL; treat that as "not an image"
    # instead of failing on None.startswith().
    content_type = asset['upload_content_type'] or ''
    destdir = 'images/' if content_type.startswith('image') else 'files/'
    target = destdir + file_name

    page_list.append({ 'date': asset['upload_updated_at'], 'revision': rev, 'comment': "asset {}".format(asset['name']), 'fname': source, 'gname': target, 'editor': 'admin', 'email': 'admin@cccms.de'})
    rev = rev + 1
    redirect_list[upload_path] = target

with open('redirects.json', 'w') as outfile:
    json.dump(redirect_list, outfile)

# Export every published page revision of every node as a Markdown file with
# a metadata header under dump/, and queue it in page_list so it can later be
# committed to the git repository in chronological order.
cursor.execute("select id, unique_name, created_at from nodes order by id", [])
nodes = cursor.fetchall()

# Commit address used when the editor has no user record or no e-mail.
fallback_email = 'admin@cccms.de'


def _login_for(user_id):
    """Return the login stored for *user_id*; "admin" if unset or unknown."""
    if not user_id:
        return "admin"
    cursor.execute("select login from users where id = %s", [user_id])
    rows = cursor.fetchall()
    if not rows:
        return "admin"
    return rows[0].get("login", "NO-LOGIN")


for node in nodes:
    # The column is always selected, so test its value: a node whose
    # unique_name is NULL/empty cannot be mapped to a file path.
    if not node['unique_name']:
        print("WARNING: NO unique_name in node " + str(node['id']))
        continue

    # Node 1 is never exported (presumably the root node -- TODO confirm).
    if node['id'] == 1:
        continue

    unique_name = str(node['unique_name'])

    # Make sure the parent directories of the dump file and of the git file
    # exist; pages under updates/ live at the repository root, everything
    # else below pages/.
    pathlib.Path(os.path.dirname('dump/' + unique_name)).mkdir(parents=True, exist_ok=True)
    if unique_name.startswith('updates/'):
        pathlib.Path(os.path.dirname('git/' + unique_name)).mkdir(parents=True, exist_ok=True)
    else:
        pathlib.Path(os.path.dirname('git/pages/' + unique_name)).mkdir(parents=True, exist_ok=True)

    cursor.execute("select id, revision, created_at, updated_at, published_at, user_id, editor_id from pages where node_id = %s", [node['id']])
    pages = cursor.fetchall()

    for page in pages:

        # ignoring unpublished pages
        if not page['published_at']:
            continue

        # Flatten the single-column rows into a list of tag names once per
        # page, dropping NULL names so the join below cannot fail.
        cursor.execute("select name from tags where id in (select tag_id from taggings where taggable_id = %s)", [page['id']])
        tags = [row[0] for row in cursor.fetchall() if row[0]]

        editor = _login_for(page['editor_id'])
        creator = _login_for(page['user_id'])

        # First related asset (position 1) is used as the preview image.
        previewimage = ''
        cursor.execute("select upload_file_name from related_assets full join assets on related_assets.asset_id = assets.id where related_assets.page_id = %s and position = 1;", [page['id']])
        related_asset = cursor.fetchall()
        if related_asset:
            previewimage = related_asset[0].get('upload_file_name')

        cursor.execute("select locale, title, abstract, body, created_at, updated_at from page_translations where page_id = %s", [page['id']])
        page_translations = cursor.fetchall()
        for translation in page_translations:
            if not translation.get("title"):
                continue

            # English translations get an .en.md suffix, every other locale
            # plain .md.  The dump file name additionally carries revision
            # and update timestamp so revisions do not overwrite each other.
            suffix = '.en.md' if translation['locale'] == 'en' else '.md'
            fname = "dump/" + unique_name + suffix + ':' + str(page['revision']) + '@' + str(int(page['updated_at'].timestamp()))
            gitname = unique_name + suffix
            if not gitname.startswith('updates/'):
                gitname = 'pages/' + gitname

            # A NULL body must become empty text, not the string "None".
            body_html = translation.get('body') or ""

            with open(fname, "w", encoding="utf-8") as f:
                f.write("title: {}\n".format(' '.join(translation.get("title", "").split())))
                f.write("date: {}\n".format(page['published_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                if page.get('updated_at'):
                    f.write("updated: {}\n".format(page['updated_at'].strftime("%Y-%m-%d %H:%M:%S %z")))
                f.write("author: {}\n".format(creator))
                f.write("tags: {}\n".format(', '.join(tags).lower()))
                if previewimage:
                    f.write("previewimage: /images/{}\n".format(previewimage))
                f.write("\n")

                # Add abstract, if one is there; the teaser marker separates
                # it from the body.
                if translation.get('abstract'):
                    f.write(str(translation.get('abstract', "")))
                    f.write("\n\n")
                    if body_html:
                        f.write("<!-- TEASER_END -->\n\n")

                if body_html:
                    f.write(pypandoc.convert_text(body_html, 'markdown-smart', format='html-native_divs-native_spans', extra_args=['--atx-headers']))

            # The editor may be the "admin" default or a login without a
            # user record; fall back to a fixed address instead of raising
            # StopIteration.
            userrec = next((person for person in user if person['login'] == editor), None)
            email = userrec['email'] if userrec and userrec['email'] else fallback_email
            page_list.append({ 'date': page['updated_at'], 'revision': page['revision'], 'comment': "page revision {}".format(page['revision']), 'fname': fname, 'gname': gitname, 'editor': editor, 'email': email})

# Replay all queued files into the git repository, oldest first, one commit
# per entry, attributed to the entry's editor and dated with its timestamp.
import subprocess

page_list.sort(key=lambda tup: (tup['date'], tup['revision']))
for page in page_list:
    print(page)

    # Argument lists (no shell) keep file names, logins and comments coming
    # from the database intact even when they contain spaces, quotes or
    # shell metacharacters.  Exit codes are deliberately not checked, so one
    # failing entry does not abort the whole import.
    subprocess.run(["cp", "--", page['fname'], "git/{}".format(page['gname'])])
    subprocess.run(["git", "-C", "git/", "add", "--", page['gname']])

    # git reads the committer identity from the environment.
    os.environ['GIT_COMMITTER_NAME'] = page['editor']
    os.environ['GIT_COMMITTER_EMAIL'] = page['email']
    subprocess.run([
        "git", "-C", "git/", "commit",
        "-m", "committing {}".format(page['comment']),
        "--author={} <{}>".format(page['editor'], page['email']),
        "--date={}".format(page['date']),
    ])