initial commit

This commit is contained in:
Nathan Adams 2015-09-08 21:56:08 -05:00
commit 88d65df2bb
11 changed files with 443 additions and 0 deletions

36
Issue.py Normal file
View File

@ -0,0 +1,36 @@
class IssueComment(object):
    """A single comment on an issue: who wrote it, when, and what it says."""

    # Class-level defaults; instances override them by plain assignment.
    author = title = summary = date = ""

    def getDate(self):
        """Return the date string of the comment."""
        return self.date

    def getSummary(self):
        """Return the body text of the comment."""
        return self.summary

    def getTitle(self):
        """Return the title of the comment."""
        return self.title

    def getAuthor(self):
        """Return the author of the comment."""
        return self.author
class Issue(IssueComment):
    """An issue from a project's tracker.

    Extends IssueComment with a status, an id and the list of comments
    posted on the issue.
    """

    status = ""
    # Class-level default kept for backward compatibility; __init__ shadows
    # it with a per-instance list so issues never share comments.
    comments = []
    id = -1

    def __init__(self):
        super(Issue, self).__init__()
        # A mutable class attribute would be shared by every Issue, so
        # appending a comment to one issue would show up on all of them.
        self.comments = []

    def getId(self):
        """Return the issue id (-1 until one is assigned)."""
        return self.id

    def getStatus(self):
        """Return the status string of the issue."""
        return self.status

    def getComments(self):
        """Return the list of IssueComment objects attached to this issue."""
        return self.comments

    # Original (misspelled) name, kept so existing callers keep working.
    getCommnets = getComments

26
Project.py Normal file
View File

@ -0,0 +1,26 @@
from enum import enum
# Kinds of source repository a Project can have. Assuming `enum` is this
# project's enum.py helper, names are numbered in the order given
# (SVN=0, git=1, hg=2, NA=3) and REPO_TYPES.val maps a number back to its name.
REPO_TYPES = enum("SVN", "git", "hg", "NA")
class Project(object):
    """Archived data for one hosted project: repository, releases, issues, wikis."""

    repoURL = ""
    # Class-level defaults kept for backward compatibility; __init__ shadows
    # the three lists with per-instance ones.
    releases = []
    issues = []
    wikis = []
    repoType = REPO_TYPES.NA

    def __init__(self):
        # Mutable class attributes are shared by every instance; give each
        # project its own lists so appending to one never leaks into another.
        self.releases = []
        self.issues = []
        self.wikis = []

    def getRepoURL(self):
        """Return the URL of the project's source repository."""
        return self.repoURL

    def getReleases(self):
        """Return the list of Release objects (downloads) of the project."""
        return self.releases

    def getIssues(self):
        """Return the list of Issue objects of the project."""
        return self.issues

    def getRepoType(self):
        """Return the repository type, one of the REPO_TYPES values."""
        return self.repoType

    def getWikis(self):
        """Return the list of Wiki objects of the project."""
        return self.wikis

52
README.txt Normal file
View File

@ -0,0 +1,52 @@
# codescrape
Version 1.0
By: Nathan Adams
License: MIT
## Description
This library archives project data. After the announcement that Google Code was going archive-only, I wanted to create a library that lets you grab source data before it is gone forever.
Use cases include:
Archive projects due to:
- Hosting service shutting down
- Authorities sending cease-and-desist against provider/project
- Historical, research, or educational purposes
## Usage
Currently srchub and google code are supported. To use:
from services.srchub import srchub
shub = srchub()
projects = shub.getProjects()
or for google code
from services.googlecode import googlecode
gcode = googlecode()
project = gcode.getProject("android-python27")
The srchub library pulls all public projects, since that list is easily accessed. Google Code does not have a public project list per se, and I didn't want to scrape the search results, so the Google Code library requires you to pass in the project name. If you get your hands on a list of Google Code projects, you can easily loop through them:
from services.googlecode import googlecode
gcode = googlecode()
for project in someProjectList:
project = gcode.getProject(project)
# do something with project
The project data structure is as follows:
project
- getRepoURL() -> Returns the URL of the repo
- getRepoType() -> Returns the type of repo (git, hg, or SVN)
- getReleases() -> Returns all downloads related to the project
- getIssues() -> Returns open issues
- getWikis() -> Returns wikis

18
Release.py Normal file
View File

@ -0,0 +1,18 @@
class Release(object):
    """A downloadable file published by a project."""

    # Class-level defaults; instances override them by plain assignment.
    fileName = summary = fileURL = ""
    checksum = None

    def getChecksum(self):
        """Return the checksum of the file, or None when none was recorded."""
        return self.checksum

    def getFileURL(self):
        """Return the URL the file can be downloaded from."""
        return self.fileURL

    def getSummary(self):
        """Return the summary text of the release."""
        return self.summary

    def getFileName(self):
        """Return the file name of the release."""
        return self.fileName

41
Service.py Normal file
View File

@ -0,0 +1,41 @@
import pycurl
try:
from cStringIO import StringIO
except ImportError:
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from urllib import urlencode
class Service(object):
    """Base class for hosting-service scrapers; provides pycurl HTTP helpers."""

    def getProjects(self):
        """Return the projects of the service.

        The base implementation does nothing and returns None; subclasses
        override it.
        """
        pass

    def _curl_request(self, url, header, postdata=None):
        """Perform one HTTP request and return the buffer holding the body.

        A POST is sent when `postdata` (an already url-encoded string) is
        given, otherwise a GET.
        """
        response = StringIO()
        cobj = pycurl.Curl()
        try:
            cobj.setopt(pycurl.URL, url)
            # NOTE(review): certificate and host-name verification are
            # switched off, exactly as before. That permits man-in-the-middle
            # tampering; re-enable once the target hosts have valid certs.
            cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
            cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
            if postdata is not None:
                cobj.setopt(pycurl.POST, 1)
                cobj.setopt(pycurl.POSTFIELDS, postdata)
            cobj.setopt(pycurl.WRITEDATA, response)
            cobj.setopt(pycurl.HTTPHEADER, [] if header is None else header)
            cobj.perform()
        finally:
            # Release the handle even when perform() raises pycurl.error.
            cobj.close()
        return response

    def curl_post(self, url, postvals, header=None):
        """POST `postvals` (url-encoded) to `url`.

        `header` is an optional list of raw header lines. Returns the
        StringIO buffer the response body was written to; call .getvalue()
        on it to read the body.
        """
        return self._curl_request(url, header, urlencode(postvals))

    def curl_get(self, url, header=None):
        """GET `url`.

        `header` is an optional list of raw header lines. Returns the
        StringIO buffer the response body was written to; call .getvalue()
        on it to read the body.
        """
        return self._curl_request(url, header)

22
Wiki.py Normal file
View File

@ -0,0 +1,22 @@
class Wiki(object):
    """One wiki/documentation page of a project."""

    # Class-level defaults; instances override them by plain assignment.
    pageName = ""
    htmlContent = ""
    textContent = ""
    summary = ""
    updated = ""

    def getPageName(self):
        """Return the name of the wiki page."""
        return self.pageName

    def getHTMLContent(self):
        """Return the page content as HTML."""
        return self.htmlContent

    # Original (misspelled) name, kept so existing callers keep working.
    getHTMLCotnent = getHTMLContent

    def getTextContent(self):
        """Return the page content as plain text."""
        return self.textContent

    def getSummary(self):
        """Return the summary of the page."""
        return self.summary

    def getUpdated(self):
        """Return the 'last updated' string of the page."""
        return self.updated

7
enum.py Normal file
View File

@ -0,0 +1,7 @@
# Pythonic way to do enums:
# http://stackoverflow.com/a/1695250/195722
def enum(*sequential, **named):
    """Build a simple enum-like class.

    Positional names are numbered 0, 1, 2, ... in the order given; keyword
    arguments supply explicit values. The returned class exposes every name
    as a class attribute, plus a `val` dict mapping each value back to its
    name.
    """
    enums = dict(zip(sequential, range(len(sequential))), **named)
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    reverse = dict((value, key) for key, value in enums.items())
    enums['val'] = reverse
    return type('Enum', (), enums)

8
main.py Normal file
View File

@ -0,0 +1,8 @@
from services.srchub import srchub
from services.googlecode import googlecode
# Scraping every public srchub project is disabled in this script;
# uncomment the next two lines to enable it.
#shub = srchub()
#projects = shub.getProjects()
# Fetch a single Google Code project by name.
gcode = googlecode()
project = gcode.getProject("android-python27")

1
services/__init__.py Normal file
View File

@ -0,0 +1 @@
# Author metadata for the services package.
__author__ = 'nathan'

113
services/googlecode.py Normal file
View File

@ -0,0 +1,113 @@
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup
class googlecode(Service):
    """Scraper for a single project hosted on Google Code."""

    DOMAIN = "https://code.google.com"
    # Text the listing pages show when a list has no rows.
    NO_RESULTS = "Your search did not generate any results."

    # Since I want to stay on Google's good side
    # I'm going to write this method to parse a single project
    # You will need to provide your own project list to roll through
    # Such a list exists (although incomplete)
    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
    def getProject(self, projectName):
        """Scrape one Google Code project and return it as a Project.

        projectName is the project's short name as used in its URL
        (DOMAIN + "/p/" + projectName + "/"). The returned Project has its
        repoType, repoURL, releases, issues (with comments) and wikis
        filled in. Pages are fetched with curl_get; an AttributeError is
        raised if an expected element is missing from a page.
        """
        project = Project()
        projectURL = self.DOMAIN + "/p/" + projectName + "/"
        self._detectRepo(project, projectName, projectURL)
        project.releases = self._getReleases(projectName, projectURL)
        project.issues = self._getIssues(projectURL)
        project.wikis = self._getWikis(projectURL)
        return project

    def _soup(self, url):
        """Fetch url and return its body parsed with BeautifulSoup."""
        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")

    def _detectRepo(self, project, projectName, projectURL):
        """Set project.repoType and project.repoURL from the "Source" page."""
        projectpageSoup = self._soup(projectURL)
        sourceHref = projectpageSoup.find(name="a", string="Source").get("href")
        if sourceHref.startswith(("http://", "https://")):
            sourceURL = sourceHref
        elif sourceHref.startswith("/"):
            # Root-relative link: prefixing "/p/" as well would give
            # ".../p//p/<name>/..." and fetch the wrong page.
            sourceURL = self.DOMAIN + sourceHref
        else:
            sourceURL = self.DOMAIN + "/p/" + sourceHref
        sourceSoupText = self._soup(sourceURL).get_text()
        # The checkout instructions reveal the version control system;
        # anything that is neither git nor svn is treated as Mercurial.
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"

    def _getReleases(self, projectName, projectURL):
        """Return a Release for every row of the project's download list."""
        releases = []
        downloadsSoup = self._soup(projectURL + "downloads/list")
        if self.NO_RESULTS in downloadsSoup.get_text():
            return releases
        downloadSection = downloadsSoup.find("table", "results")
        # Row 0 is the table header.
        for downloadRow in downloadSection.find_all("tr")[1:]:
            cols = downloadRow.find_all("td")
            fileName = cols[1].a.text.replace("\n", "").strip(" ")
            release = Release()
            release.fileURL = "https://" + projectName + ".googlecode.com/files/" + fileName
            release.fileName = fileName
            releases.append(release)
        return releases

    def _getIssues(self, projectURL):
        """Return an Issue (with its comments) for every row of the issue list."""
        issues = []
        issuesSoup = self._soup(projectURL + "issues/list")
        if self.NO_RESULTS in issuesSoup.get_text():
            return issues
        issuesSection = issuesSoup.find("table", "results")
        for issueRow in issuesSection.find_all("tr")[1:]:
            cols = issueRow.find_all("td")
            issueId = cols[1].text.replace("\n", "").strip()
            # Summary and title both come from the same column; only the
            # summary has surrounding spaces removed (as before).
            summaryText = cols[8].text.replace("\n", "")
            issue = Issue()
            # issue.author is intentionally left at its default, as before.
            issue.comments = []
            issue.status = cols[3].text.replace("\n", "").strip(" ")
            issue.summary = summaryText.strip(" ")
            issue.title = summaryText
            issue.id = issueId
            # we must go deeper to get comments
            issueURL = projectURL + "issues/detail?id=" + issueId
            for comment in self._soup(issueURL).find_all("div", "vt"):
                issueComment = IssueComment()
                issueComment.date = comment.find("span", "date")["title"]
                # The last link inside the author element is used as the
                # author; store its text rather than the raw child-node list
                # so the value is a string, as in the srchub scraper.
                issueComment.author = comment.find(class_="author").find_all("a")[-1].get_text()
                issueComment.summary = comment.find("pre").get_text()
                issue.comments.append(issueComment)
            issues.append(issue)
        return issues

    def _getWikis(self, projectURL):
        """Return a Wiki (with content) for every row of the wiki page list."""
        wikis = []
        wikiSoup = self._soup(projectURL + "w/list")
        if self.NO_RESULTS in wikiSoup.get_text():
            return wikis
        wikiSection = wikiSoup.find("table", "results")
        for wikiRow in wikiSection.find_all("tr")[1:]:
            cols = wikiRow.find_all("td")
            wiki = Wiki()
            wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
            wiki.summary = cols[2].text.replace("\n", "").strip(" ")
            wiki.updated = cols[3].text.replace("\n", "").strip(" ")
            wikiContent = self._soup(projectURL + "wiki/" + wiki.pageName).find(id="wikicontent")
            wiki.htmlContent = wikiContent.prettify()
            wiki.textContent = wikiContent.get_text()
            wikis.append(wiki)
        return wikis

119
services/srchub.py Normal file
View File

@ -0,0 +1,119 @@
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup
class srchub(Service):
    """Scraper for all public projects hosted on srchub."""

    DOMAIN = "https://beta.datanethost.net"

    def getProjects(self):
        """Scrape every project on the public project list.

        Returns a list of Project objects, each with repoType, repoURL,
        releases, issues (with comments) and wikis filled in. Pages are
        fetched with curl_get; an AttributeError is raised if an expected
        element is missing from a page.
        """
        # Perhaps I should provide more API endpoints to make scraping easier...
        links = self._soup(self.DOMAIN + "/projects/").find("ul", "prjlistclass")
        return [self._getProject(self.DOMAIN + link.get("href"))
                for link in links.find_all("a")]

    def _soup(self, url):
        """Fetch url and return its body parsed with BeautifulSoup."""
        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")

    def _getProject(self, projectURL):
        """Scrape the project whose page is at projectURL."""
        project = Project()
        # The project URL ends with "/<name>/", so the name is the
        # second-to-last path piece.
        projectName = projectURL.split("/")[-2]
        self._detectRepo(project, projectName, projectURL)
        project.releases = self._getReleases(projectName, projectURL)
        project.issues = self._getIssues(projectURL)
        project.wikis = self._getWikis(projectURL)
        return project

    def _detectRepo(self, project, projectName, projectURL):
        """Set project.repoType and project.repoURL from the "Source" page."""
        sourceHref = self._soup(projectURL).find(name="a", string="Source").get("href")
        sourceSoupText = self._soup(self.DOMAIN + sourceHref).get_text()
        # DOMAIN already carries its scheme; drop it before building repo
        # URLs so they do not come out as "git://https://host/...".
        host = self.DOMAIN.split("://", 1)[-1]
        # The checkout instructions reveal the version control system;
        # anything that is neither git nor svn is treated as Mercurial.
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "git://" + host + "/" + projectName + ".git"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "https://" + host + "/svn/" + projectName + "/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://" + host + "/hg/" + projectName + "/"

    def _getReleases(self, projectName, projectURL):
        """Return a Release for every row of the project's download table."""
        releases = []
        downloadsSoup = self._soup(projectURL + "downloads/")
        if "No downloads were found." in downloadsSoup.get_text():
            return releases
        downloadSection = downloadsSoup.find("table", "uploads")
        # Row 0 is the table header.
        for downloadRow in downloadSection.find_all("tr")[1:]:
            fileName = downloadRow.find_all("td")[0].a.text
            release = Release()
            release.fileURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + fileName
            release.fileName = fileName
            releases.append(release)
        return releases

    def _getIssues(self, projectURL):
        """Return an Issue (with its comments) for every row of the issue table."""
        issues = []
        issuesSoup = self._soup(projectURL + "issues/")
        if "No issues were found." in issuesSoup.get_text():
            return issues
        issuesSection = issuesSoup.find("table", "recent-issues")
        for issueRow in issuesSection.find_all("tr")[1:]:
            cols = issueRow.find_all("td")
            issue = Issue()
            issue.id = cols[0].text
            issue.summary = cols[1].text
            issue.title = cols[1].find("a").text
            issue.status = cols[2].text
            issue.author = cols[3].text
            issue.comments = []
            # we must go deeper to get comments
            issueURL = projectURL + "issues/" + issue.id + "/"
            for comment in self._soup(issueURL).find_all("div", "issue-comment"):
                byline = comment.find("p").get_text()
                issueComment = IssueComment()
                issueComment.date = comment.find("span").get_text()
                # Author is the text between the first "by" and the next
                # comma. Splitting only once keeps names that themselves
                # contain "by" (e.g. "bobby") intact; strip() drops the
                # space that follows "by".
                issueComment.author = byline.split("by", 1)[1].split(",")[0].strip()
                issueComment.summary = comment.find("pre").get_text()
                issue.comments.append(issueComment)
            issues.append(issue)
        return issues

    def _getWikis(self, projectURL):
        """Return a Wiki (with content) for every row of the documentation table."""
        wikis = []
        wikiSoup = self._soup(projectURL + "doc/")
        if "No documentation pages were found." in wikiSoup.get_text():
            return wikis
        # The documentation table uses the same CSS class as the issue table.
        wikiSection = wikiSoup.find("table", "recent-issues")
        for wikiRow in wikiSection.find_all("tr")[1:]:
            cols = wikiRow.find_all("td")
            wiki = Wiki()
            wiki.pageName = cols[0].text
            wiki.summary = cols[1].text
            wiki.updated = cols[2].text
            wikiContent = self._soup(projectURL + "page/" + wiki.pageName + "/").find(id="wiki-content")
            wiki.htmlContent = wikiContent.prettify()
            wiki.textContent = wikiContent.get_text()
            wikis.append(wiki)
        return wikis