# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
import posixpath
import traceback
from cStringIO import StringIO
from zipfile import ZipFile

import appengine_blobstore as blobstore
from appengine_url_fetcher import AppEngineUrlFetcher
from appengine_wrappers import urlfetch
from docs_server_utils import StringIdentity
from file_system import (FileNotFoundError,
                         FileSystem,
                         FileSystemError,
                         StatInfo)
from future import Future
from object_store_creator import ObjectStoreCreator
from path_util import AssertIsDirectory, IsDirectory
import url_constants


_GITHUB_REPOS_NAMESPACE = 'GithubRepos'


def _LoadCredentials(object_store_creator):
  '''Returns (username, password) from |password_store|.
  '''
  password_store = object_store_creator.Create(
      GithubFileSystem, category='password', start_empty=False)
  password_data = password_store.GetMulti(('username', 'password')).Get()
  return password_data.get('username'), password_data.get('password')
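# Note: if no credentials have ever been stored then both values will be
# None; in that case the fetcher presumably issues unauthenticated requests.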


class _GithubZipFile(object):
  '''A view of a ZipFile with a more convenient interface which ignores the
  'zipball' prefix that all paths have. The zip files that come straight from
  GitHub have paths like ['zipball/foo.txt', 'zipball/bar.txt'] but we only
  care about ['foo.txt', 'bar.txt'].
  '''

  @classmethod
  def Create(cls, repo_name, blob):
    try:
      zipball = ZipFile(StringIO(blob))
    except Exception:
      logging.warning('zipball "%s" is not a valid zip' % repo_name)
      return None

    if not zipball.namelist():
      logging.warning('zipball "%s" is empty' % repo_name)
      return None

    name_prefix = None  # probably 'zipball'
    paths = []
    for name in zipball.namelist():
      prefix, path = name.split('/', 1)
      if name_prefix and prefix != name_prefix:
        logging.warning('zipball "%s" has names with inconsistent prefix: %s' %
                        (repo_name, zipball.namelist()))
        return None
      name_prefix = prefix
      paths.append(path)
    return cls(zipball, name_prefix, paths)

  def __init__(self, zipball, name_prefix, paths):
    self._zipball = zipball
    self._name_prefix = name_prefix
    self._paths = paths

  def Paths(self):
    '''Return all file paths in this zip file.
    '''
    return self._paths

  def List(self, path):
    '''Returns all files within a directory at |path|. Not recursive. Paths
    are returned relative to |path|.
    '''
    AssertIsDirectory(path)
    return [p[len(path):] for p in self._paths
            if p != path and
               p.startswith(path) and
               '/' not in p[len(path):].rstrip('/')]
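  # Illustration (hypothetical paths): if |self._paths| contained
  # ['docs/', 'docs/README.md', 'docs/api/', 'docs/api/tabs.html'] then
  # List('docs/') would return ['README.md', 'api/'] -- immediate children
  # only, not 'api/tabs.html'.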

  def Read(self, path):
    '''Returns the contents of |path|. Raises a KeyError if it doesn't exist.
    '''
    return self._zipball.read(posixpath.join(self._name_prefix, path))


class GithubFileSystem(FileSystem):
  '''Allows reading from a github.com repository.
  '''

  @staticmethod
  def Create(owner, repo, object_store_creator):
    '''Creates a GithubFileSystem that corresponds to a single GitHub
    repository specified by |owner| and |repo|.
    '''
    return GithubFileSystem(
        url_constants.GITHUB_REPOS,
        owner,
        repo,
        object_store_creator,
        AppEngineUrlFetcher)
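  # Sketch of typical use (hypothetical owner/repo values):
  #   fs = GithubFileSystem.Create('GoogleChrome', 'chrome-app-samples',
  #                                object_store_creator)
  #   manifest = fs.ReadSingle('manifest.json').Get()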

  @staticmethod
  def ForTest(repo, fake_fetcher, path=None, object_store_creator=None):
    '''Creates a GithubFileSystem that can be used for testing. It reads zip
    files and commit data from server2/test_data/github_file_system/test_owner
    instead of github.com. It reads from files specified by |repo|.
    '''
    return GithubFileSystem(
        path if path is not None else 'test_data/github_file_system',
        'test_owner',
        repo,
        object_store_creator or ObjectStoreCreator.ForTest(),
        fake_fetcher)
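  # For example, a test might construct one as (FakeGithubFetcher being a
  # hypothetical stand-in for a test fetcher class):
  #   fs = GithubFileSystem.ForTest('simple_repo', FakeGithubFetcher)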

  def __init__(self, base_url, owner, repo, object_store_creator, Fetcher):
    self._repo_key = posixpath.join(owner, repo)
    self._repo_url = posixpath.join(base_url, owner, repo)
    self._username, self._password = _LoadCredentials(object_store_creator)
    self._blobstore = blobstore.AppEngineBlobstore()
    self._fetcher = Fetcher(self._repo_url)
    # Stores whether the GitHub data is up to date. This will either be True
    # or empty, the emptiness most likely due to this being a cron run.
    self._up_to_date_cache = object_store_creator.Create(
        GithubFileSystem, category='up-to-date')
    # Caches the zip file's stat. Overrides start_empty=False and uses
    # |self._up_to_date_cache| to determine whether we need to refresh.
    self._stat_cache = object_store_creator.Create(
        GithubFileSystem, category='stat-cache', start_empty=False)

    # Created lazily in |_EnsureRepoZip|.
    self._repo_zip = None

  def _EnsureRepoZip(self):
    '''Initializes |self._repo_zip| if it hasn't already been (i.e. if
    _EnsureRepoZip has never been called before). In that case |self._repo_zip|
    will be set to a Future of _GithubZipFile and the fetch process started,
    whether that be from a blobstore or if necessary all the way from GitHub.
    '''
    if self._repo_zip is not None:
      return

    repo_key, repo_url, username, password = (
        self._repo_key, self._repo_url, self._username, self._password)

    def fetch_from_blobstore(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched from blobstore.
      '''
      blob = None
      try:
        blob = self._blobstore.Get(repo_url, _GITHUB_REPOS_NAMESPACE)
      except blobstore.BlobNotFoundError:
        pass

      if blob is None:
        logging.warning('No blob for %s found in datastore' % repo_key)
        return fetch_from_github(version)

      repo_zip = _GithubZipFile.Create(repo_key, blob)
      if repo_zip is None:
        logging.warning('Blob for %s was corrupted in blobstore!?' % repo_key)
        return fetch_from_github(version)

      return Future(value=repo_zip)

    def fetch_from_github(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched new from GitHub, then writes it to blobstore and |version| to the
      stat cache.
      '''
      def get_zip(github_zip):
        try:
          blob = github_zip.content
        except urlfetch.DownloadError:
          raise FileSystemError('Failed to download repo %s file from %s' %
                                (repo_key, repo_url))

        repo_zip = _GithubZipFile.Create(repo_key, blob)
        if repo_zip is None:
          raise FileSystemError('Blob for %s was fetched corrupted from %s' %
                                (repo_key, repo_url))

        self._blobstore.Set(self._repo_url, blob, _GITHUB_REPOS_NAMESPACE)
        self._up_to_date_cache.Set(repo_key, True)
        self._stat_cache.Set(repo_key, version)
        return repo_zip
      return self._fetcher.FetchAsync(
          'zipball', username=username, password=password).Then(get_zip)

    # To decide whether we need to re-stat, and from there whether to re-fetch,
    # make use of ObjectStore's start-empty configuration. If
    # |object_store_creator| is configured to start empty then our creator
    # wants to refresh (e.g. running a cron), so fetch the live stat from
    # GitHub. If the stat hasn't changed since last time then there is no
    # reason to re-fetch from GitHub; just take it from blobstore.
    cached_version = self._stat_cache.Get(repo_key).Get()
    if self._up_to_date_cache.Get(repo_key).Get() is None:
      # This is either a cron or an instance where a cron has never been run.
      live_version = self._FetchLiveVersion(username, password)
      if cached_version != live_version:
        # Note: branch intentionally triggered if |cached_version| is None.
        logging.info('%s has changed, fetching from GitHub.' % repo_url)
        self._repo_zip = fetch_from_github(live_version)
      else:
        # Already up to date. Fetch from blobstore. No need to set up-to-date
        # to True here since it'll already be set for instances, and it'll
        # never be set for crons.
        logging.info('%s is up to date.' % repo_url)
        self._repo_zip = fetch_from_blobstore(cached_version)
    else:
      # Instance where a cron has been run. It should be in blobstore.
      self._repo_zip = fetch_from_blobstore(cached_version)

    assert self._repo_zip is not None
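  # In summary there are three outcomes (a sketch of the logic above):
  #   * start-empty creator (cron) or first run, and the live sha differs
  #     from the cached one: fetch the zipball from GitHub.
  #   * start-empty creator or first run, and the shas match: read from
  #     blobstore.
  #   * up-to-date flag already set (normal instance): read from blobstore.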

  def _FetchLiveVersion(self, username, password):
    '''Fetches the current repository version from github.com and returns it.
    The version is a 'sha' hash value.
    '''
    # TODO(kalman): Do this asynchronously (use FetchAsync).
    result = self._fetcher.Fetch(
        'commits/HEAD', username=username, password=password)
    try:
      return json.loads(result.content)['sha']
    except (KeyError, ValueError):
      raise FileSystemError('Error parsing JSON from repo %s: %s' %
                            (self._repo_url, traceback.format_exc()))
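  # For reference, the commits/HEAD response is JSON along the lines of
  # (abridged, values hypothetical):
  #   {"sha": "4c3f87d8e11f...", "commit": {...}, ...}
  # and only the top-level 'sha' is used as the version.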

  def Refresh(self):
    return self.ReadSingle('')

  def Read(self, paths, skip_not_found=False):
    '''Returns a dictionary mapping |paths| to the contents of the file at each
    path. If a path ends with a '/', it is treated as a directory and is mapped
    to a list of filenames in that directory.
    '''
    self._EnsureRepoZip()
    def read(repo_zip):
      reads = {}
      for path in paths:
        if path not in repo_zip.Paths():
          raise FileNotFoundError('"%s": %s not found' % (self._repo_key, path))
        if IsDirectory(path):
          reads[path] = repo_zip.List(path)
        else:
          reads[path] = repo_zip.Read(path)
      return reads
    return self._repo_zip.Then(read)
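  # Illustration (hypothetical paths): Read(['manifest.json', 'docs/'])
  # returns a Future resolving to something like
  #   {'manifest.json': '<file contents>', 'docs/': ['index.html', 'api/']}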

  def Stat(self, path):
    '''Stats |path| returning its version as a StatInfo object. If |path| ends
    with a '/', it is assumed to be a directory and the StatInfo object
    returned includes child_versions for all paths in the directory.

    File paths do not include the name of the zip file, which is arbitrary and
    useless to consumers.

    The version is the commit sha recorded in |self._stat_cache| when the
    repository's zipball was last fetched, so it only changes when the
    repository is re-fetched.
    '''
    self._EnsureRepoZip()
    repo_zip = self._repo_zip.Get()

    if path not in repo_zip.Paths():
      raise FileNotFoundError('"%s" does not contain file "%s"' %
                              (self._repo_key, path))

    version = self._stat_cache.Get(self._repo_key).Get()
    assert version is not None, ('There was a zipball in datastore; there '
                                 'should be a version cached for it')

    stat_info = StatInfo(version)
    if IsDirectory(path):
      stat_info.child_versions = dict((p, StatInfo(version))
                                      for p in repo_zip.List(path))
    return stat_info
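  # Illustration (hypothetical values): Stat('docs/') would yield a StatInfo
  # whose version is the cached commit sha and whose child_versions maps each
  # entry of List('docs/') (e.g. 'README.md', 'api/') to that same version.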

  def GetIdentity(self):
    return '%s' % StringIdentity(self.__class__.__name__ + self._repo_key)

  def __repr__(self):
    return '%s(key=%s, url=%s)' % (type(self).__name__,
                                   self._repo_key, self._repo_url)