initial commit
commit 88d65df2bb
Issue.py (Normal file)
@@ -0,0 +1,36 @@
class IssueComment(object):

    author = ""
    title = ""
    summary = ""
    date = ""

    def getAuthor(self):
        return self.author

    def getTitle(self):
        return self.title

    def getSummary(self):
        return self.summary

    def getDate(self):
        return self.date


class Issue(IssueComment):

    status = ""
    comments = []
    id = -1

    def getId(self):
        return self.id

    def getStatus(self):
        return self.status

    def getComments(self):
        return self.comments
Project.py (Normal file)
@@ -0,0 +1,26 @@
from enum import enum  # the project's local enum.py helper (Python 2 implicit relative import)

REPO_TYPES = enum("SVN", "git", "hg", "NA")


class Project(object):

    repoURL = ""
    releases = []
    issues = []
    wikis = []
    repoType = REPO_TYPES.NA

    def getRepoURL(self):
        return self.repoURL

    def getReleases(self):
        return self.releases

    def getIssues(self):
        return self.issues

    def getRepoType(self):
        return self.repoType

    def getWikis(self):
        return self.wikis
README.txt (Normal file)
@@ -0,0 +1,52 @@
# codescrape

Version 1.0

By: Nathan Adams

License: MIT

## Description

This library archives project data. With the announcement that Google Code is going archive-only, I wanted a library that lets you grab source data before it is gone forever.

Use cases include archiving projects because:

- The hosting service is shutting down
- Authorities have sent a cease-and-desist against the provider or project
- Historical, research, or educational purposes

## Usage

Currently srchub and Google Code are supported. To use:

    from services.srchub import srchub
    shub = srchub()
    projects = shub.getProjects()

or for Google Code:

    from services.googlecode import googlecode
    gcode = googlecode()
    project = gcode.getProject("android-python27")

The srchub library will pull all public projects, since that list is easily accessed. Google Code does not have a public project list per se, and I didn't want to scrape the search results, so the Google Code backend requires you to pass in the project name. If you get your hands on a list of Google Code projects, you can easily loop through them:

    from services.googlecode import googlecode
    gcode = googlecode()
    for project in someProjectList:
        project = gcode.getProject(project)
        # do something with project

The project data structure is as follows:

project

- getRepoURL() -> Returns the URL of the repo
- getRepoType() -> Returns the type of repo (git, hg, or SVN)
- getReleases() -> Returns all downloads related to the project
- getIssues() -> Returns open issues
- getWikis() -> Returns wikis
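
Each getter returns the plain values collected by the scraper, so a fetched project can be walked directly. A minimal sketch (assuming a project obtained as above, using only the getters listed here):

    for release in project.getReleases():
        print release.getFileName(), release.getFileURL()
    for issue in project.getIssues():
        print issue.getId(), issue.getStatus(), issue.getTitle()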
Release.py (Normal file)
@@ -0,0 +1,18 @@
class Release(object):

    fileName = ""
    summary = ""
    fileURL = ""
    checksum = None

    def getFileName(self):
        return self.fileName

    def getSummary(self):
        return self.summary

    def getFileURL(self):
        return self.fileURL

    def getChecksum(self):
        return self.checksum
Service.py (Normal file)
@@ -0,0 +1,41 @@
import pycurl
try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        from io import StringIO
from urllib import urlencode


class Service(object):
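    # Thin pycurl wrapper shared by the scraper backends. curl_get() and
    # curl_post() return the StringIO buffer that collected the raw response
    # body; callers read it with .getvalue() (see services/googlecode.py and
    # services/srchub.py).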

    def getProjects(self):
        pass

    def curl_post(self, url, postvals, header=[]):
        buffer = StringIO()
        cobj = pycurl.Curl()
        cobj.setopt(pycurl.URL, url)
        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
        cobj.setopt(pycurl.POST, 1)
        cobj.setopt(pycurl.WRITEDATA, buffer)
        postdata = urlencode(postvals)
        cobj.setopt(pycurl.POSTFIELDS, postdata)
        cobj.setopt(pycurl.HTTPHEADER, header)
        cobj.perform()
        cobj.close()
        return buffer

    def curl_get(self, url, header=[]):
        buffer = StringIO()
        cobj = pycurl.Curl()
        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
        cobj.setopt(pycurl.URL, url)
        cobj.setopt(pycurl.WRITEDATA, buffer)
        cobj.setopt(pycurl.HTTPHEADER, header)
        cobj.perform()
        cobj.close()
        return buffer
Wiki.py (Normal file)
@@ -0,0 +1,22 @@
class Wiki(object):

    pageName = ""
    htmlContent = ""
    textContent = ""
    summary = ""
    updated = ""

    def getPageName(self):
        return self.pageName

    def getHTMLContent(self):
        return self.htmlContent

    def getTextContent(self):
        return self.textContent

    def getSummary(self):
        return self.summary

    def getUpdated(self):
        return self.updated
enum.py (Normal file)
@@ -0,0 +1,7 @@
# Pythonic way to do enums:
# http://stackoverflow.com/a/1695250/195722
def enum(*sequential, **named):
    enums = dict(zip(sequential, range(len(sequential))), **named)
    reverse = dict((value, key) for key, value in enums.iteritems())
    enums['val'] = reverse
    return type('Enum', (), enums)
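# Example usage (illustrative sketch, matching the call in Project.py):
#   REPO_TYPES = enum("SVN", "git", "hg", "NA")
#   REPO_TYPES.git     -> 1
#   REPO_TYPES.val[1]  -> "git"   (reverse lookup via the generated 'val' dict)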
main.py (Normal file)
@@ -0,0 +1,8 @@
from services.srchub import srchub
from services.googlecode import googlecode

#shub = srchub()
#projects = shub.getProjects()
gcode = googlecode()
project = gcode.getProject("android-python27")
services/__init__.py (Normal file)
@@ -0,0 +1 @@
__author__ = 'nathan'
services/googlecode.py (Normal file)
@@ -0,0 +1,113 @@
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup


class googlecode(Service):

    DOMAIN = "https://code.google.com"

    # Since I want to stay on Google's good side
    # I'm going to write this method to parse a single project
    # You will need to provide your own project list to roll through
    # Such a list exists (although incomplete)
    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
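    # For example, with a hypothetical decompressed list file (same pattern as
    # the loop shown in README.txt):
    #   gcode = googlecode()
    #   for name in open("gcProjectInfo2012-Nov.txt"):
    #       project = gcode.getProject(name.strip())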
    def getProject(self, projectName):
        project = Project()
        sourceType = None
        projectURL = self.DOMAIN + "/p/" + projectName + "/"

        projectpageHTML = self.curl_get(projectURL).getvalue()
        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")

        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
        sourceSoupText = sourceSoup.get_text()

        # get source
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"

        # get downloads
        project.releases = []
        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
        downloadSection = downloadsSoup.find("table", "results")
        if "Your search did not generate any results." not in downloadsSoup.get_text():
            downloadRows = downloadSection.find_all("tr")[1:]
            for downloadRow in downloadRows:
                cols = downloadRow.find_all("td")
                downloadTD = cols[1]
                downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
                fileName = downloadTD.a.text.replace("\n", "").strip(" ")
                release = Release()
                release.fileURL = downloadURL
                release.fileName = fileName
                project.releases.append(release)

        # get issues
        project.issues = []
        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in issuesSoup.get_text():
            issuesSection = issuesSoup.find("table", "results")
            for issueRow in issuesSection.find_all("tr")[1:]:
                issue = Issue()
                cols = issueRow.find_all("td")
                issueId = cols[1].text.replace("\n", "").strip()
                issueURL = projectURL + "issues/detail?id=" + issueId
                issueStatus = cols[3].text.replace("\n", "").strip(" ")
                issueSummary = cols[8].text.replace("\n", "")
                issueTitle = cols[8].text.replace("\n", "")
                issueAuthor = cols[5].text.replace("\n", "")

                #issue.author = issueAuthor
                issue.comments = []
                issue.status = issueStatus.strip(" ")
                issue.summary = issueSummary.strip(" ")
                issue.title = issueTitle
                issue.id = issueId

                # we must go deeper to get comments
                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                for comment in issueComments.find_all("div", "vt"):
                    #author = comment.find(class_="author").find("a").text
                    author = (comment.find(class_="author").find_all("a")[-1]).contents
                    date = comment.find("span", "date")["title"]
                    commentText = comment.find("pre").get_text()
                    issueComment = IssueComment()
                    issueComment.date = date
                    issueComment.author = author
                    issueComment.summary = commentText
                    issue.comments.append(issueComment)

                project.issues.append(issue)

        # get wiki pages
        project.wikis = []
        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
        if "Your search did not generate any results." not in wikiSoup.get_text():
            wikiSection = wikiSoup.find("table", "results")
            for wikiRow in wikiSection.find_all("tr")[1:]:
                wiki = Wiki()
                cols = wikiRow.find_all("td")
                wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
                wiki.summary = cols[2].text.replace("\n", "").strip(" ")
                wiki.updated = cols[3].text.replace("\n", "").strip(" ")
                wikiURL = projectURL + "wiki/" + wiki.pageName
                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                wikiContent = wikiPageSoup.find(id="wikicontent")
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
                project.wikis.append(wiki)

        return project
services/srchub.py (Normal file)
@@ -0,0 +1,119 @@
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup


class srchub(Service):

    DOMAIN = "https://beta.datanethost.net"

    def getProjects(self):
        # Perhaps I should provide more API endpoints to make scraping easier...
        projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
        soup = BeautifulSoup(projectlist, "html.parser")
        links = soup.find("ul", "prjlistclass")
        projects = []
        for link in links.find_all("a"):
            project = Project()
            sourceType = None
            projectURL = self.DOMAIN + link.get("href")
            projectName = projectURL.split("/")[-2]

            projectpageHTML = self.curl_get(projectURL).getvalue()
            projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")

            sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
            sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
            sourceSoupText = sourceSoup.get_text()

            # get source
            # DOMAIN already carries the https:// scheme; strip it here so the
            # repo URLs below are not built with two schemes (e.g. "git://https://...").
            host = self.DOMAIN.replace("https://", "")
            if "git clone" in sourceSoupText:
                project.repoType = REPO_TYPES.git
                project.repoURL = "git://" + host + "/" + projectName + ".git"
            elif "svn co" in sourceSoupText:
                project.repoType = REPO_TYPES.SVN
                project.repoURL = "https://" + host + "/svn/" + projectName + "/"
            else:
                project.repoType = REPO_TYPES.hg
                project.repoURL = "https://" + host + "/hg/" + projectName + "/"

            # get downloads
            project.releases = []
            downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
            downloadSection = downloadsSoup.find("table", "uploads")
            if "No downloads were found." not in downloadsSoup.get_text():
                downloadRows = downloadSection.find_all("tr")[1:]
                for downloadRow in downloadRows:
                    cols = downloadRow.find_all("td")
                    downloadTD = cols[0]
                    downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
                    fileName = downloadTD.a.text
                    release = Release()
                    release.fileURL = downloadURL
                    release.fileName = fileName
                    project.releases.append(release)

            # get issues
            project.issues = []
            issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
            if "No issues were found." not in issuesSoup.get_text():
                issuesSection = issuesSoup.find("table", "recent-issues")
                for issueRow in issuesSection.find_all("tr")[1:]:
                    issue = Issue()
                    cols = issueRow.find_all("td")
                    issueId = cols[0].text
                    issueURL = projectURL + "issues/" + issueId + "/"
                    issueStatus = cols[2].text
                    issueSummary = cols[1].text
                    issueTitle = cols[1].find("a").text
                    issueAuthor = cols[3].text
                    issue.author = issueAuthor
                    issue.comments = []
                    issue.status = issueStatus
                    issue.summary = issueSummary
                    issue.title = issueTitle
                    issue.id = issueId
                    # we must go deeper to get comments
                    issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
                    for comment in issueComments.find_all("div", "issue-comment"):
                        author = comment.find("p").get_text().split("by")[1].split(",")[0]
                        date = comment.find("span").get_text()
                        commentText = comment.find("pre").get_text()
                        issueComment = IssueComment()
                        issueComment.date = date
                        issueComment.author = author
                        issueComment.summary = commentText
                        issue.comments.append(issueComment)

                    project.issues.append(issue)

            # get wiki pages
            project.wikis = []
            wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
            if "No documentation pages were found." not in wikiSoup.get_text():
                wikiSection = wikiSoup.find("table", "recent-issues")
                for wikiRow in wikiSection.find_all("tr")[1:]:
                    wiki = Wiki()
                    cols = wikiRow.find_all("td")
                    wiki.pageName = cols[0].text
                    wiki.summary = cols[1].text
                    wiki.updated = cols[2].text
                    wikiURL = projectURL + "page/" + wiki.pageName + "/"
                    wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
                    wikiContent = wikiPageSoup.find(id="wiki-content")
                    wiki.htmlContent = wikiContent.prettify()
                    wiki.textContent = wikiContent.get_text()
                    project.wikis.append(wiki)

            projects.append(project)

        return projects