
Changes in Artifact files and presentation URL

1 file changed, +52 −25
import sys
import six
import os
import logging
import dateutil.parser
import datetime
import urllib
import requests
from future.utils import raise_from
from urllib.parse import urlparse
@@ -57,34 +59,48 @@ class USENIXImporter(BaseImporter):
         abst = soup.find('div',class_='field-name-field-paper-description')
         if not abst:
             LOG.warn("no abstract in data")
-        abstract = abst.find('div' , class_='field-item odd').text
+        else:
+            abstract = abst.find('div' , class_='field-item odd').text
         bib_text = soup.find('div', class_='bibtex-text-entry').text
+        bib_dict={}
         if not bib_text:
             LOG.warn("no bibtext in metadata")
-        bib_database = bibtexparser.loads(bib_text)
-        bib_dict = bib_database.entries[0]
+        else:
+            bib_database = bibtexparser.loads(bib_text)
+            bib_dict = bib_database.entries[0]
         affiliations = []
         org_dict = {}
-        authors = soup.find('div', class_='field-name-field-paper-people-text').find('div' , class_='field-item odd')
-        author_list = authors.find('p').text.split(';')
-        for i in author_list:
-            author = i.split(',')
-            org_name = author[-1].strip()
-            if(org_name in org_dict):
-                org = org_dict[org_name]
-            else:
-                org = Organization(name = org_name, type="Institution")
-                org_dict[org_name] = org
-            for a in range(len(author)-1):
-                if(author[a].startswith(" and ")):
-                    names = author[a].split()
-                    person = Person(name=names[1].strip())
-                    affiliations.append(ArtifactAffiliation(affiliation=Affiliation(person=person,org=org,roles="Author")))
-                else:
-                    person = Person(name=author[a].strip())
-                    affiliations.append(ArtifactAffiliation(affiliation=Affiliation(person=person,org=org,roles="Author")))
+        authors = soup.find('div', class_='field-name-field-paper-people-text')
+        if authors:
+            authors = authors.find('div' , class_='field-item odd')
+            author_list = authors.find('p').text.split(';')
+            for i in author_list:
+                author = i.split(',')
+                org_name = author[-1].strip()
+                if(org_name in org_dict):
+                    org = org_dict[org_name]
+                else:
+                    org = Organization(name = org_name, type="Institution")
+                    org_dict[org_name] = org
+                for a in range(len(author)-1):
+                    if(author[a].startswith(" and ")):
+                        names = author[a].split()
+                        person = Person(name=names[1].strip())
+                        affiliations.append(ArtifactAffiliation(affiliation=Affiliation(person=person,org=org,roles="Author")))
+                    else:
+                        person = Person(name=author[a].strip())
+                        affiliations.append(ArtifactAffiliation(affiliation=Affiliation(person=person,org=org,roles="Author")))
+        if not affiliations:
+            if('author' in bib_dict):
+                authors = bib_dict['author'].split('and')
+                for a in authors:
+                    affiliations.append(ArtifactAffiliation(affiliation=Affiliation(person=Person(name = a.strip()),roles="Author")))
+            else:
+                LOG.warn("no authors in metadata")
         meta_data = {}
         if 'isbn' in bib_dict:
             meta_data['ISBN'] = bib_dict['isbn']
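The new author block expects the people field to look like "Name1 and Name2, Organization; Name3, Other Organization": it splits first on ';' (one group per affiliation) and then on ',' (names first, organization last). A minimal standalone sketch of that split, assuming that input shape and returning plain (name, organization) tuples instead of the importer's Person/Organization/ArtifactAffiliation objects; the sample string is invented:

def parse_author_field(text):
    # "Name1 and Name2, Org; Name3, Other Org" -> [(name, org), ...]
    pairs = []
    for group in text.split(';'):
        parts = group.split(',')
        org_name = parts[-1].strip()
        for raw in parts[:-1]:
            name = raw.strip()
            if name.startswith("and "):
                name = name[len("and "):].strip()   # drop the leading "and"
            pairs.append((name, org_name))
    return pairs

parse_author_field("Ada Lovelace and Alan Turing, Example University; Grace Hopper, Example Labs")
# [('Ada Lovelace', 'Example University'), ('Alan Turing', 'Example University'), ('Grace Hopper', 'Example Labs')]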
@@ -114,6 +130,9 @@ class USENIXImporter(BaseImporter):
         presentation_video= soup.find('div', class_='embedded-video')
         if presentation_video:
             presentation_video_link = presentation_video.find('iframe')['src']
+            parsed_url = urlparse(presentation_video_link)
+            if not parsed_url.scheme:
+                presentation_video_link = "https:" + presentation_video_link
             metadata.append(ArtifactMetadata(
                 name="presentation_video", value=str(presentation_video_link), type="text/json",
                 source="usenix"))
@@ -122,17 +141,25 @@ class USENIXImporter(BaseImporter):
         presentation_pdf = soup.find('div', class_='field-name-field-presentation-pdf')
         if presentation_pdf:
             presentation_pdf_link = presentation_pdf.find('a')['href']
+            path = urlparse(presentation_pdf_link).path
+            ext = os.path.splitext(path)
+            typ = ext[1][1:]
+            name = ext[0].split('/')[-1]+ext[1]
             artifact_files.append(ArtifactFile(
-                name="presentation_pdf", url=presentation_pdf_link, filetype="application/pdf"))
+                name=name, url=presentation_pdf_link, filetype="application/"+typ))
         presentation_slides= soup.find('div', class_ = 'field-name-field-paper-slides-file')
         if presentation_slides:
-            presentation_slides_link = presentation_slides.find('img')['src']
+            presentation_slides_link = presentation_slides.find('a')['href']
+            path = urlparse(presentation_slides_link).path
+            ext = os.path.splitext(path)
+            typ = ext[1][1:]
+            name = ext[0].split('/')[-1]+ext[1]
             artifact_files.append(ArtifactFile(
-                name="presentation_slides", url=presentation_slides_link, filetype="application/pdf"))
+                name=name, url=presentation_slides_link, filetype="application/"+typ))
         return Artifact(
             type="publication",url=url,title=title,description=abstract,
             name=title,ctime=datetime.datetime.now(),ext_id=url,
             owner=self.owner_object,importer=self.importer_object,
-            tags=[],meta=metadata,files = artifact_files,affiliations=affiliations,parent=parent)
\ No newline at end of file
+            tags=[],meta=metadata,files = artifact_files,affiliations=affiliations,parent=parent)
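The ArtifactFile changes stop hard-coding name="presentation_pdf" / filetype="application/pdf" and instead derive both from the download URL. A rough sketch of that derivation with urlparse and os.path.splitext, using an invented URL; like the diff, it simply prefixes the raw extension with "application/" rather than doing a real MIME lookup:

import os
from urllib.parse import urlparse

def file_name_and_type(url):
    path = urlparse(url).path            # drops any query string or fragment
    stem, ext = os.path.splitext(path)   # e.g. ('/files/sec_slides_doe', '.pdf')
    name = stem.split('/')[-1] + ext     # basename, extension kept
    filetype = "application/" + ext[1:]  # naive mapping: '.pptx' -> 'application/pptx'
    return name, filetype

file_name_and_type("https://www.usenix.org/sites/default/files/conference/example_slides.pdf")
# ('example_slides.pdf', 'application/pdf')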