
Log more failures and fix a bug

Merged Bryan R requested to merge b/log-more-failures into master
1 file   +35   -12
@@ -50,7 +50,9 @@ class GHCPerfWebhookServer(WebhookServer):
         # Fire off spurious failure processing
         t = threading.Thread( \
             target=self._process_spurious_failures, \
-            args=(event['build_id'], event['build_finished_at']))
+            args=(event['build_id'], \
+                  event['build_started_at'], \
+                  event['repository']['homepage']))
         t.start()
         proj_id = event['project_id']
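For context, the handler pulls these values straight from GitLab's job-event webhook payload. A minimal sketch of the fields it touches (key names taken from the code above; the example values are hypothetical):

    # Hypothetical excerpt of a GitLab job-event payload, reduced to the keys
    # the handler reads; real payloads carry many more fields.
    event = {
        'build_id': 123456,                              # forwarded as job_id
        'build_started_at': '2021-01-01 12:00:00 UTC',   # forwarded as date (was build_finished_at)
        'project_id': 1,
        'repository': {
            'homepage': 'https://gitlab.haskell.org/ghc/ghc',   # forwarded as project_url
        },
    }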
@@ -159,7 +161,7 @@ class GHCPerfWebhookServer(WebhookServer):
         with ZipFile(archive_path) as archive:
             archive.extractall(path=out_dir)
 
-    def _process_spurious_failures(self, job_id: int, date: datetime) -> None:
+    def _process_spurious_failures(self, job_id: int, date: datetime, project_url: str) -> None:
         """
         Adds a record of known spurious failures to the ci_failure table.
         """
@@ -171,31 +173,52 @@ class GHCPerfWebhookServer(WebhookServer):
         ## Furthermore, the API endpoint for job logs is unbearably slow for
         ## large logs (and all our logs are large). We have to go through the
         ## user API, not the machine API.
-        raw_url = f'https://gitlab.haskell.org/ghc/ghc/-/jobs/{job_id}/raw'
+        raw_url = f'{project_url}/-/jobs/{job_id}/raw'
         resp = requests.get(raw_url, headers={'user-agent': GHCPerfWebhookServer.user_agent})
+        if re.search('users/sign_in$', resp.url):
+            logging.error(f'{raw_url} got redirected to login -- probably a 404')
+            return
         trace = resp.content
 
         # Identify the failure types, if any
         failures = []
-        if re.search(b'Cannot connect to the Docker daemon at unix:///var/run/docker.sock', trace):
-            logging.info(f'Docker failure detected in job {job_id}')
+        grep = lambda s: re.search(s, trace)
+        joblog = lambda s: logging.info(f'job {job_id}: {s}')
+
+        if grep(b'Cannot connect to the Docker daemon at unix:///var/run/docker.sock'):
+            joblog('docker failure')
             failures.append('docker')
+        if grep(b'Error response from daemon: \w+ "https://registry.gitlab.haskell.org') \
+                or grep(b'Error response from daemon: manifest for registry.gitlab.haskell.org'):
+            joblog('image pull failure')
+            failures.append('pull_image')
+        if grep(b'Failed to connect to gitlab.haskell.org'):
+            joblog('GitLab connection failure')
+            failures.append('gitlab_connect')
+        if grep(b'No space left on device'):
+            joblog('exhausted disk')
+            failures.append('no_space')
+        if grep(b'failed due to signal 9 .Killed'):
+            joblog('received signal 9')
+            failures.append('signal_9')
         ## ... others to be added...
 
         # Record our failures
         if len(failures) == 0:
-            logging.info(f'No failures detected for job {job_id}.')
+            joblog(f'No failures detected')
             return
 
-        ## For idempotency and re-runnability, this deletes all previous
-        ## results for this job_id!
         with psycopg2.connect(self.conn_string) as db:
             cur = db.cursor()
-            cur.execute('delete from ci_failure where job_id = %s', (job_id,))
             values = map(lambda f: (job_id, f, date), failures)
-            cur.executemany('insert into ci_failure (job_id, type, job_date) values (%s, %s, %s)', values)
+            cur.executemany('''
+                insert into ci_failure (job_id, type, job_date)
+                values (%s, %s, %s)
+                on conflict do nothing
+            ''', values)
 
 def main() -> None:
     import argparse
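One note on the new insert: "on conflict do nothing" replaces the old delete-then-reinsert step, and it only keeps re-runs idempotent if ci_failure carries a unique constraint covering the inserted rows. The schema is not part of this MR; a sketch of what it presumably needs to look like (column names taken from the insert statement, the constraint itself is an assumption):

    # Hypothetical DDL for ci_failure -- not part of this MR.  Without a unique
    # constraint like this one, 'on conflict do nothing' never fires and a
    # re-run of the same job would insert duplicate rows.
    import psycopg2

    def ensure_ci_failure_table(conn_string: str) -> None:
        with psycopg2.connect(conn_string) as db:
            cur = db.cursor()
            cur.execute('''
                create table if not exists ci_failure (
                    job_id   integer     not null,
                    type     text        not null,
                    job_date timestamptz not null,
                    unique (job_id, type)
                )
            ''')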
@@ -240,7 +263,7 @@ def main() -> None:
             raise Exception('--failcheck-project must be specified with --failcheck')
         project = gl.projects.get(args.failcheck_project)
         job = project.jobs.get(args.failcheck)
-        server._process_spurious_failures(args.failcheck, job.finished_at)
+        server._process_spurious_failures(args.failcheck, job.started_at, project.web_url)
     else:
         server.run()
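For reference, the --failcheck branch drives the same method by hand through python-gitlab. A standalone sketch under the same assumptions (the project path and job id are placeholders; the server's constructor arguments are not shown in this diff, so the final call is left commented out):

    # Hypothetical standalone run of the --failcheck code path via python-gitlab.
    import gitlab

    gl = gitlab.Gitlab('https://gitlab.haskell.org')
    project = gl.projects.get('ghc/ghc')          # args.failcheck_project
    job = project.jobs.get(123456)                # args.failcheck (placeholder id)
    # server._process_spurious_failures(123456, job.started_at, project.web_url)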