aboutsummaryrefslogtreecommitdiffstats
path: root/cms/sitemap.py
blob: 19179217d7b6c78828a2056435687f2c31c509b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# -*- coding: utf-8 -*-
#
#   cms.py - simple WSGI/Python based CMS script
#
#   Copyright (C) 2021 Michael Buesch <m@bues.ch>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.

#from cms.cython_support cimport * #@cy

from cms.db import *
from cms.exception import *
from cms.pageident import *
#from cms.util import * #+cimport

from xml.sax import saxutils

__all__ = [
	"CMSSiteMap",
]

class CMSSiteMap(object):
	"""Site map generator.
	Specification: https://www.sitemaps.org/protocol.html

	The generated document consists of one <url> element per page
	found by walking the page tree in the database, followed by one
	<url> element per entry of the user supplied "site-map" database string.
	"""

	BASE_INDENT	= 1
	INDENT		= "  "
	MORE_ESCAPES	= {
		"'" : "&apos;",
		'"' : "&quot;",
	}

	# XML namespace names are opaque identifiers that are compared
	# character by character. They are not links that can be
	# "upgraded" to https. The sitemap protocol and the XML Schema
	# specification both define them with the http:// scheme.
	XMLNS		= "http://www.sitemaps.org/schemas/sitemap/0.9"
	XMLNS_XSI	= "http://www.w3.org/2001/XMLSchema-instance"

	# All <changefreq> values accepted in the user "site-map" string.
	CHANGEFREQS	= (
		"always", "hourly", "daily", "weekly",
		"monthly", "yearly", "never",
	)

	def __init__(self, db, domain, urlBase):
		"""db: Database object. The methods getNavStop(), getPageStamp(),
		    getSubPages() and getString() are used.
		domain: Domain name that is put into the generated URLs.
		urlBase: URL base that is passed to the page ident's getUrl().
		"""
		self.__db = db
		self.__domain = domain
		self.__urlBase = urlBase

	@classmethod
	def __xmlQuote(cls, string):
		"""Escape &, <, > and both quote characters for use in XML text.
		"""
		return saxutils.escape(string, cls.MORE_ESCAPES)

	@classmethod
	def __oneElem(cls, ind, url, lastmod=None, changefreq=None, prio=None):
		"""Build the text lines of one <url> element.
		ind: Indentation string that is prepended to every line.
		url: Content of the mandatory <loc> element.
		lastmod, changefreq, prio: Content strings of the optional child
		    elements. A child element is omitted, if its value is
		    None or empty.
		All values are XML-escaped here. Callers pass raw strings.
		Returns a list of lines (without line breaks).
		"""
		innerInd = ind + cls.INDENT
		quotedUrl = cls.__xmlQuote(url)
		ret = [
			f'{ind}<url>',
			f'{innerInd}<loc>{quotedUrl}</loc>',
		]
		# The child elements are emitted in this fixed order.
		optionalElems = (
			("lastmod", lastmod),
			("changefreq", changefreq),
			("priority", prio),
		)
		for tag, value in optionalElems:
			if value:
				quoted = cls.__xmlQuote(value)
				ret.append(f'{innerInd}<{tag}>{quoted}</{tag}>')
		ret.append(f'{ind}</url>')
		return ret

	def __getUrlElems(self, pageIdent, protocol, indent=BASE_INDENT):
		"""Generator: Walk the page tree, starting at pageIdent.
		Yields one list of lines (see __oneElem) for pageIdent
		and then recursively for all of its sub pages.
		If the database reports a nav-stop for pageIdent, then nothing
		is yielded for this page and its sub pages are not visited.
		indent: Nesting depth of the page. It selects the indentation
		    of the generated lines, the URL suffix and the meta data.
		"""
		if self.__db.getNavStop(pageIdent):
			return

		ind = self.INDENT * indent
		if indent <= self.BASE_INDENT + 1:
			pageSuffix = "/" # Groups.
		else:
			pageSuffix = ".html" # Pages and sub groups.
		url = pageIdent.getUrl(protocol=protocol,
				       domain=self.__domain,
				       urlBase=self.__urlBase,
				       pageSuffix=pageSuffix)
		if indent == self.BASE_INDENT + 1:
			# Main groups.
			lastmod = None
			changefreq = "monthly"
			prio = "0.3"
		else:
			# Pages, main page and sub groups.
			# NOTE(review): The "Z" (UTC) suffix is appended unconditionally.
			# This presumably expects getPageStamp() to return UTC - confirm.
			stamp = self.__db.getPageStamp(pageIdent)
			lastmod = stamp.strftime("%Y-%m-%dT%H:%M:%SZ")
			changefreq = None
			prio = "0.7"
		yield self.__oneElem(ind=ind,
				     url=url,
				     lastmod=lastmod,
				     changefreq=changefreq,
				     prio=prio)

		subPages = self.__db.getSubPages(pageIdent)
		if subPages:
			# Only the page name of each sub page entry is needed here.
			for pagename, _pagelabel, _pageprio in subPages:
				subPageIdent = CMSPageIdent(pageIdent + [pagename])
				yield from self.__getUrlElems(subPageIdent,
							      protocol,
							      indent + 1)

	def __getUserUrlElems(self, protocol):
		"""Generator: Yield the <url> elements from the "site-map" string.
		Each line of the string that is not empty and does not start
		with '#' has the white space separated format:
		    URL [PRIORITY [CHANGEFREQ]]
		PRIORITY defaults to "0.7" and CHANGEFREQ defaults to "always".
		URL is appended to "protocol://domain/".
		Raises CMSException(500), if a line has more than three items,
		if PRIORITY is not a number in the range 0.0 to 1.0 or
		if CHANGEFREQ is not one of CHANGEFREQS.
		"""
		userSiteMap = self.__db.getString("site-map")
		if not userSiteMap:
			return
		for line in userSiteMap.splitlines():
			line = line.strip()
			if not line or line.startswith("#"):
				continue
			lineItems = line.split()
			if len(lineItems) > 3:
				raise CMSException(500, "site-map: Invalid line format.")
			url = lineItems[0]
			prio = lineItems[1] if len(lineItems) >= 2 else "0.7"
			changefreq = lineItems[2] if len(lineItems) >= 3 else "always"
			try:
				prioValue = float(prio)
			except ValueError:
				raise CMSException(500, "site-map: Invalid priority value.")
			# The protocol only allows priorities from 0.0 to 1.0.
			# This comparison is also False for NaN.
			if not (0.0 <= prioValue <= 1.0):
				raise CMSException(500, "site-map: Invalid priority value.")
			if changefreq not in self.CHANGEFREQS:
				raise CMSException(500, "site-map: Invalid changefreq value.")
			url = f'{protocol}://{self.__domain}/{url}'
			yield self.__oneElem(ind=self.INDENT,
					     url=url,
					     changefreq=changefreq,
					     prio=prio)

	def getSiteMap(self, rootPageIdent, protocol):
		"""Generate the complete site map XML document.
		rootPageIdent: Page ident of the root of the page tree.
		protocol: URL scheme (without "://") used for all generated URLs.
		Returns the document as one string with "\\n" line breaks.
		"""
		ret = [ '<?xml version="1.0" encoding="UTF-8"?>' ]
		ret.append(f'<urlset xmlns="{self.XMLNS}" '
			   f'xmlns:xsi="{self.XMLNS_XSI}" '
			   f'xsi:schemaLocation="{self.XMLNS} '
			   f'{self.XMLNS}/sitemap.xsd">')
		for urlElemLines in self.__getUrlElems(rootPageIdent, protocol):
			ret.extend(urlElemLines)
		for urlElemLines in self.__getUserUrlElems(protocol):
			ret.extend(urlElemLines)
		ret.append('</urlset>')
		return "\n".join(ret)
bues.ch cgit interface