cms/pageident.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210

# -*- coding: utf-8 -*-
#
#   cms.py - simple WSGI/Python based CMS script
#
#   Copyright (C) 2011-2019 Michael Buesch <m@bues.ch>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 2 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.

#from cms.cython_support cimport * #@cy

from cms.exception import *
from cms.util import * #+cimport

import re
import os

__all__ = [
	"CMSPageIdent",
]

class CMSPageIdent(object):
	# Page identifier.

	__slots__ = (
		# Path components.
		# List of str.
		"__elements",

		# Boolean.
		# True, if all __elements have been validated.
		"__allValidated",
	)

	__pageFileName_re	= re.compile(
		r'^(.*)((?:\.html?)|(?:\.py)|(?:\.php))$', re.DOTALL)
	__indexPages		= {"", "index"}

	# Parse a page identifier from a string.
	# That string may contain malicious components such as backwards
	# traversals (".." in the file path). This class takes care to reject
	# such page identifiers before use as filesystem path.
	@classmethod
	def parse(cls, path, maxPathLen=512, maxIdentDepth=32):
		if len(path) > maxPathLen:
			raise CMSException(400, "Invalid URL")

		pageIdent = cls()

		# Strip whitespace and slashes
		path = path.strip(' \t/')

		# Remove page file extensions like .html and such.
		m = cls.__pageFileName_re.match(path)
		if m:
			path = m.group(1)

		# Use the ident elements, if this is not the root page.
		if path not in cls.__indexPages:
			pageIdent.extend(path.split("/"))

		if len(pageIdent.__elements) > maxIdentDepth:
			raise CMSException(400, "Invalid URL")

		return pageIdent

	__pathSep = os.path.sep
	__validPathChars = LOWERCASE + UPPERCASE + NUMBERS + "-_."

	# Validate a path component. Avoid any directory change.
	# Raises CMSException on failure.
	@classmethod
	def validateSafePathComponent(cls, pcomp):
		if pcomp.startswith('.'):
			# No ".", ".." and hidden files.
			raise CMSException(404, "Invalid page path")
		if [ c for c in pcomp if c not in cls.__validPathChars ]:
			raise CMSException(404, "Invalid page path")
		return pcomp

	# Validate a path. Avoid going back in the hierarchy (. and ..)
	# Raises CMSException on failure.
	@classmethod
	def validateSafePath(cls, path):
		for pcomp in path.split(cls.__pathSep):
			cls.validateSafePathComponent(pcomp)
		return path

	# Validate a page name.
	# Raises CMSException on failure.
	# If allowSysNames is True, system names starting with "__" are allowed.
	@classmethod
	def validateName(cls, name, allowSysNames=False):
		if name.startswith("__") and not allowSysNames:
			# Page names with __ are system folders.
			raise CMSException(404, "Invalid page name")
		return cls.validateSafePathComponent(name)

	# Initialize this page identifier.
	def __init__(self, initialElements=None):
		self.__elements = []
		self.extend(initialElements)
		self.__allValidated = False

	# Add a list of path elements to this identifier.
	def extend(self, other):
		if other is not None:
			self.__allValidated = False

			if isinstance(other, self.__class__):
				self.__elements.extend(other.__elements)
			elif isiterable(other):
				self.__elements.extend(other)
			else:
				raise CMSException(500, "Invalid 'other' in CMSPageIdent.extend()")
		return self

	# Add a list of path elements to this identifier.
	def __iadd__(self, other):
		return self.extend(other)

	# Create a new page identifier from 'self' and add 'other'.
	def __add__(self, other):
		return self.__class__(self).extend(other)

	# Get the number of path components in this path identifier.
	def __len__(self):
		return len(self.__elements)

	# Validate all page identifier name components.
	# (Do not allow system name components)
	def __validateAll(self):
		if not self.__allValidated:
			for pcomp in self.__elements:
				self.validateName(pcomp)
			# Remember that we validated.
			# (This flag must be reset to false, if components are added.)
			self.__allValidated = True

	# Get one page identifier component by index.
	def get(self, index, default=None, allowSysNames=False):
		try:
			return self.validateName(self.__elements[index],
						 allowSysNames)
		except IndexError:
			return default

	# Get the page identifier as URL.
	def getUrl(self, protocol=None, domain=None,
		   urlBase=None, pageSuffix=".html"):
		self.__validateAll()
		url = []
		if protocol:
			url.append(protocol + ":/")
		if domain:
			url.append(domain)
		if urlBase:
			url.append(urlBase.strip("/"))
		localPath = [elem for elem in self.__elements if elem]
		url.extend(localPath)
		if not protocol and not domain:
			url.insert(0, "")
		urlStr = "/".join(url)
		if localPath and pageSuffix:
			urlStr += pageSuffix
		return urlStr

	# Get the page identifier as filesystem path.
	def getFilesystemPath(self, rstrip=0):
		self.__validateAll()
		if self.__elements:
			if rstrip > 0:
				pcomps = self.__elements[ : 0 - rstrip]
				if pcomps:
					return fs.mkpath(*pcomps)
				return ""
			return fs.mkpath(*(self.__elements))
		return ""

	# Test if this identifier starts with the same elements
	# as another one.
	def startswith(self, other):
		return other is not None and\
		       len(self.__elements) >= len(other.__elements) and\
		       self.__elements[ : len(other.__elements)] == other.__elements

	def __hash__(self):
#@cy		cdef Py_ssize_t h
#@cy		cdef list elements
#@cy		cdef str element

		h = 0
		elements = self.__elements
		for element in elements:
			h ^= hash(element)
		return h

	def __eq__(self, other):
		return (isinstance(other, self.__class__) and
			self.__elements == other.__elements)