From 88d65df2bb567eea5fdd6e7b9f9b7375672cfb4d Mon Sep 17 00:00:00 2001
From: Nathan Adams
Date: Tue, 8 Sep 2015 21:56:08 -0500
Subject: [PATCH] initial commit

---
 Issue.py               |  36 ++++++++++++
 Project.py             |  26 +++++++++
 README.txt             |  61 ++++++++++++++++++++++
 Release.py             |  18 ++++++
 Service.py             |  45 ++++++++++++++++
 Wiki.py                |  22 ++++++++
 enum.py                |  11 ++++
 main.py                |   8 +++
 services/__init__.py   |   1 +
 services/googlecode.py | 113 +++++++++++++++++++++++++++++++++++++++++
 services/srchub.py     | 120 ++++++++++++++++++++++++++++++++++++++++++++
 11 files changed, 461 insertions(+)
 create mode 100644 Issue.py
 create mode 100644 Project.py
 create mode 100644 README.txt
 create mode 100644 Release.py
 create mode 100644 Service.py
 create mode 100644 Wiki.py
 create mode 100644 enum.py
 create mode 100644 main.py
 create mode 100644 services/__init__.py
 create mode 100644 services/googlecode.py
 create mode 100644 services/srchub.py

diff --git a/Issue.py b/Issue.py
new file mode 100644
index 0000000..f7bcb85
--- /dev/null
+++ b/Issue.py
@@ -0,0 +1,36 @@
+class IssueComment(object):
+
+
+    author = ""
+    title = ""
+    summary = ""
+    date = ""
+
+    def getAuthor(self):
+        return self.author
+
+    def getTitle(self):
+        return self.title
+
+    def getSummary(self):
+        return self.summary
+
+    def getDate(self):
+        return self.date
+
+
+
+class Issue(IssueComment):
+
+    status = ""
+    comments = []
+    id = -1
+
+    def getId(self):
+        return self.id
+
+    def getStatus(self):
+        return self.status
+
+    def getComments(self):
+        return self.comments
\ No newline at end of file
diff --git a/Project.py b/Project.py
new file mode 100644
index 0000000..c2ad7da
--- /dev/null
+++ b/Project.py
@@ -0,0 +1,26 @@
+from enum import enum
+
+REPO_TYPES = enum("SVN", "git", "hg", "NA")
+
+class Project(object):
+
+    repoURL = ""
+    releases = []
+    issues = []
+    wikis = []
+    repoType = REPO_TYPES.NA
+
+    def getRepoURL(self):
+        return self.repoURL
+
+    def getReleases(self):
+        return self.releases
+
+    def getIssues(self):
+        return self.issues
+
+    def getRepoType(self):
+        return self.repoType
+
+    def getWikis(self):
+        return self.wikis
\ No newline at end of file
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..492879a
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,61 @@
+# codescrape
+
+Version 1.0
+
+By: Nathan Adams
+
+License: MIT
+
+## Description
+
+This library archives project data from code-hosting services. With the announcement that Google Code is going archive-only, I wanted to create a library that lets you grab source data before it is gone forever.
+
+Use cases include:
+
+Archiving projects due to:
+
+- Hosting service shutting down
+- Authorities sending cease-and-desist against provider/project
+- Historical, research, or educational purposes
+
+## Usage
+
+Currently srchub and Google Code are supported. To use:
+
+    from services.srchub import srchub
+    shub = srchub()
+    projects = shub.getProjects()
+
+or, for Google Code:
+
+    from services.googlecode import googlecode
+    gcode = googlecode()
+    project = gcode.getProject("android-python27")
+
+The srchub service will pull all public projects, since that list is easily accessed. Google Code does not have a public list per se, and I didn't want to scrape the search results, so the Google Code service requires you to pass in the project name. If you were to get your hands on a list of Google Code projects, you could easily loop through them:
+
+    from services.googlecode import googlecode
+    gcode = googlecode()
+    for projectName in someProjectList:
+        project = gcode.getProject(projectName)
+        # do something with project
+
+The project data structure is as follows:
+
+project
+
+- getRepoURL() -> Returns the URL of the repo
+- getRepoType() -> Returns the type of repo (git, hg, or SVN)
+- getReleases() -> Returns all downloads related to the project
+- getIssues() -> Returns open issues
+- getWikis() -> Returns wikis
+
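+For example, assuming the accessors above, each wiki page of a project could be archived to disk like so (the output naming is just an illustration):
+
+    from services.googlecode import googlecode
+    gcode = googlecode()
+    project = gcode.getProject("android-python27")
+    for wiki in project.getWikis():
+        # write each page out as UTF-8 HTML
+        with open(wiki.getPageName() + ".html", "w") as f:
+            f.write(wiki.getHTMLContent().encode("utf-8"))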
diff --git a/Release.py b/Release.py
new file mode 100644
index 0000000..ecdacc6
--- /dev/null
+++ b/Release.py
@@ -0,0 +1,18 @@
+class Release(object):
+
+    fileName = ""
+    summary = ""
+    fileURL = ""
+    checksum = None
+
+    def getFileName(self):
+        return self.fileName
+
+    def getSummary(self):
+        return self.summary
+
+    def getFileURL(self):
+        return self.fileURL
+
+    def getChecksum(self):
+        return self.checksum
\ No newline at end of file
diff --git a/Service.py b/Service.py
new file mode 100644
index 0000000..d079db8
--- /dev/null
+++ b/Service.py
@@ -0,0 +1,45 @@
+import pycurl
+try:
+    from cStringIO import StringIO
+except ImportError:
+    try:
+        from StringIO import StringIO
+    except ImportError:
+        from io import StringIO
+from urllib import urlencode
+
+class Service(object):
+
+    def getProjects(self):
+        pass
+
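+    # Note: written against Python 2 (urllib.urlencode, StringIO buffers).
+    # Both helpers below return the raw response body in a StringIO object.
+    # SSL certificate verification is deliberately disabled so scraping still
+    # works against hosts with invalid or self-signed certificates.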
+    def curl_post(self, url, postvals, header = []):
+        buffer = StringIO()
+        cobj = pycurl.Curl()
+        cobj.setopt(pycurl.URL, url)
+        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
+        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
+        cobj.setopt(pycurl.POST, 1)
+        cobj.setopt(pycurl.WRITEDATA, buffer)
+        postdata = urlencode(postvals)
+        cobj.setopt(pycurl.POSTFIELDS, postdata)
+        cobj.setopt(pycurl.HTTPHEADER, header)
+        cobj.perform()
+        cobj.close()
+        return buffer
+
+    def curl_get(self, url, header = []):
+        buffer = StringIO()
+        cobj = pycurl.Curl()
+        cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
+        cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
+        cobj.setopt(pycurl.URL, url)
+        cobj.setopt(pycurl.WRITEDATA, buffer)
+        cobj.setopt(pycurl.HTTPHEADER, header)
+        cobj.perform()
+        cobj.close()
+        return buffer
\ No newline at end of file
diff --git a/Wiki.py b/Wiki.py
new file mode 100644
index 0000000..ef64928
--- /dev/null
+++ b/Wiki.py
@@ -0,0 +1,22 @@
+class Wiki(object):
+
+    pageName = ""
+    htmlContent = ""
+    textContent = ""
+    summary = ""
+    updated = ""
+
+    def getPageName(self):
+        return self.pageName
+
+    def getHTMLContent(self):
+        return self.htmlContent
+
+    def getTextContent(self):
+        return self.textContent
+
+    def getSummary(self):
+        return self.summary
+
+    def getUpdated(self):
+        return self.updated
\ No newline at end of file
diff --git a/enum.py b/enum.py
new file mode 100644
index 0000000..0eaf725
--- /dev/null
+++ b/enum.py
@@ -0,0 +1,11 @@
+# Pythonic way to do enums:
+# http://stackoverflow.com/a/1695250/195722
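+# Example, using this project's own repo-type enum:
+#   REPO_TYPES = enum("SVN", "git", "hg", "NA")
+#   REPO_TYPES.git    -> 1
+#   REPO_TYPES.val[1] -> "git"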
+def enum(*sequential, **named):
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    reverse = dict((value, key) for key, value in enums.iteritems())
+    enums['val'] = reverse
+    return type('Enum', (), enums)
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..9a51622
--- /dev/null
+++ b/main.py
@@ -0,0 +1,8 @@
+from services.srchub import srchub
+from services.googlecode import googlecode
+
+#shub = srchub()
+
+#projects = shub.getProjects()
+gcode = googlecode()
+project = gcode.getProject("android-python27")
diff --git a/services/__init__.py b/services/__init__.py
new file mode 100644
index 0000000..ee0f945
--- /dev/null
+++ b/services/__init__.py
@@ -0,0 +1 @@
+__author__ = 'nathan'
diff --git a/services/googlecode.py b/services/googlecode.py
new file mode 100644
index 0000000..6383e46
--- /dev/null
+++ b/services/googlecode.py
@@ -0,0 +1,113 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+
+class googlecode(Service):
+
+    DOMAIN = "https://code.google.com"
+
+    # Since I want to stay on Google's good side
+    # I'm going to write this method to parse a single project
+    # You will need to provide your own project list to roll through
+    # Such a list exists (although incomplete)
+    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
+    def getProject(self, projectName):
+        project = Project()
+        sourceType = None
+        projectURL = self.DOMAIN + "/p/" + projectName + "/"
+
+        projectpageHTML = self.curl_get(projectURL).getvalue()
+        projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
+
+        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
+        sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")
+        sourceSoupText = sourceSoup.get_text()
+
+        # get source
+        if "git clone" in sourceSoupText:
+            project.repoType = REPO_TYPES.git
+            project.repoURL = "https://code.google.com/p/" + projectName + "/"
+        elif "svn co" in sourceSoupText:
+            project.repoType = REPO_TYPES.SVN
+            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
+        else:
+            project.repoType = REPO_TYPES.hg
+            project.repoURL = "https://code.google.com/p/" + projectName + "/"
+
+
+        # get downloads
+        project.releases = []
+        downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")
+        downloadSection = downloadsSoup.find("table", "results")
+        if "Your search did not generate any results." not in downloadsSoup.get_text():
+            downloadRows = downloadSection.find_all("tr")[1:]
+            for downloadRow in downloadRows:
+                cols = downloadRow.find_all("td")
+                downloadTD = cols[1]
+                downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")
+                fileName = downloadTD.a.text.replace("\n", "").strip(" ")
+                release = Release()
+                release.fileURL = downloadURL
+                release.fileName = fileName
+                project.releases.append(release)
+
+        # get issues
+        project.issues = []
+        issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")
+        if "Your search did not generate any results." not in issuesSoup.get_text():
+            issuesSection = issuesSoup.find("table", "results")
+            for issueRow in issuesSection.find_all("tr")[1:]:
+                issue = Issue()
+                cols = issueRow.find_all("td")
+                issueId = cols[1].text.replace("\n", "").strip()
+                issueURL = projectURL + "issues/detail?id=" + issueId
+                issueStatus = cols[3].text.replace("\n", "").strip(" ")
+                issueSummary = cols[8].text.replace("\n", "")
+                issueTitle = cols[8].text.replace("\n", "")
+                issueAuthor = cols[5].text.replace("\n", "")
+
+                #issue.author = issueAuthor
+                issue.comments = []
+                issue.status = issueStatus.strip(" ")
+                issue.summary = issueSummary.strip(" ")
+                issue.title = issueTitle
+                issue.id = issueId
+
+                # we must go deeper to get comments
+                issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
+                for comment in issueComments.find_all("div", "vt"):
+                    #author = comment.find(class_="author").find("a").text
+                    author = (comment.find(class_="author").find_all("a")[-1]).contents
+                    date = comment.find("span", "date")["title"]
+                    commentText = comment.find("pre").get_text()
+                    issueComment = IssueComment()
+                    issueComment.date = date
+                    issueComment.author = author
+                    issueComment.summary = commentText
+                    issue.comments.append(issueComment)
+
+                project.issues.append(issue)
+
+        # get wiki pages
+        project.wikis = []
+        wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")
+        if "Your search did not generate any results." not in wikiSoup.get_text():
+            wikiSection = wikiSoup.find("table", "results")
+            for wikiRow in wikiSection.find_all("tr")[1:]:
+                wiki = Wiki()
+                cols = wikiRow.find_all("td")
+                wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
+                wiki.summary = cols[2].text.replace("\n", "").strip(" ")
+                wiki.updated = cols[3].text.replace("\n", "").strip(" ")
+                wikiURL = projectURL + "wiki/" + wiki.pageName
+                wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
+                wikiContent = wikiPageSoup.find(id="wikicontent")
+                wiki.htmlContent = wikiContent.prettify()
+                wiki.textContent = wikiContent.get_text()
+                project.wikis.append(wiki)
+
+        return project
\ No newline at end of file
diff --git a/services/srchub.py b/services/srchub.py
new file mode 100644
index 0000000..0929a3e
--- /dev/null
+++ b/services/srchub.py
@@ -0,0 +1,120 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+class srchub(Service):
+
+    DOMAIN = "https://beta.datanethost.net"
+
+    def getProjects(self):
+        # Perhaps I should provide more API endpoints to make scraping easier...
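+        # Walk the public project index; every public project is linked here.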
+        projectlist = self.curl_get(self.DOMAIN + "/projects/").getvalue()
+        soup = BeautifulSoup(projectlist, "html.parser")
+        links = soup.find("ul", "prjlistclass")
+        projects = []
+        for link in links.find_all("a"):
+            project = Project()
+            sourceType = None
+            projectURL = self.DOMAIN + link.get("href")
+            projectName = projectURL.split("/")[-2]
+
+            projectpageHTML = self.curl_get(projectURL).getvalue()
+            projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")
+
+            sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
+            sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + sourceURL).getvalue(), "html.parser")
+            sourceSoupText = sourceSoup.get_text()
+
+            # get source
+            if "git clone" in sourceSoupText:
+                project.repoType = REPO_TYPES.git
+                project.repoURL = self.DOMAIN.replace("https://", "git://") + "/" + projectName + ".git"
+            elif "svn co" in sourceSoupText:
+                project.repoType = REPO_TYPES.SVN
+                project.repoURL = self.DOMAIN + "/svn/" + projectName + "/"
+            else:
+                project.repoType = REPO_TYPES.hg
+                project.repoURL = self.DOMAIN + "/hg/" + projectName + "/"
+
+
+            # get downloads
+            project.releases = []
+            downloadsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/").getvalue(), "html.parser")
+            downloadSection = downloadsSoup.find("table", "uploads")
+            if "No downloads were found." not in downloadsSoup.get_text():
+                downloadRows = downloadSection.find_all("tr")[1:]
+                for downloadRow in downloadRows:
+                    cols = downloadRow.find_all("td")
+                    downloadTD = cols[0]
+                    downloadURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + downloadTD.a.text
+                    fileName = downloadTD.a.text
+                    release = Release()
+                    release.fileURL = downloadURL
+                    release.fileName = fileName
+                    project.releases.append(release)
+
+            # get issues
+            project.issues = []
+            issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/").getvalue(), "html.parser")
+            if "No issues were found." not in issuesSoup.get_text():
+                issuesSection = issuesSoup.find("table", "recent-issues")
+                for issueRow in issuesSection.find_all("tr")[1:]:
+                    issue = Issue()
+                    cols = issueRow.find_all("td")
+                    issueId = cols[0].text
+                    issueURL = projectURL + "issues/" + issueId + "/"
+                    issueStatus = cols[2].text
+                    issueSummary = cols[1].text
+                    issueTitle = cols[1].find("a").text
+                    issueAuthor = cols[3].text
+                    issue.author = issueAuthor
+                    issue.comments = []
+                    issue.status = issueStatus
+                    issue.summary = issueSummary
+                    issue.title = issueTitle
+                    issue.id = issueId
+                    # we must go deeper to get comments
+                    issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")
+                    for comment in issueComments.find_all("div", "issue-comment"):
+                        author = comment.find("p").get_text().split("by")[1].split(",")[0]
+                        date = comment.find("span").get_text()
+                        commentText = comment.find("pre").get_text()
+                        issueComment = IssueComment()
+                        issueComment.date = date
+                        issueComment.author = author
+                        issueComment.summary = commentText
+                        issue.comments.append(issueComment)
+
+                    project.issues.append(issue)
+
+            # get wiki pages
+            project.wikis = []
+            wikiSoup = BeautifulSoup(self.curl_get(projectURL + "doc/").getvalue(), "html.parser")
+            if "No documentation pages were found." not in wikiSoup.get_text():
+                wikiSection = wikiSoup.find("table", "recent-issues")
+                for wikiRow in wikiSection.find_all("tr")[1:]:
+                    wiki = Wiki()
+                    cols = wikiRow.find_all("td")
+                    wiki.pageName = cols[0].text
+                    wiki.summary = cols[1].text
+                    wiki.updated = cols[2].text
+                    wikiURL = projectURL + "page/" + wiki.pageName + "/"
+                    wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")
+                    wikiContent = wikiPageSoup.find(id="wiki-content")
+                    wiki.htmlContent = wikiContent.prettify()
+                    wiki.textContent = wikiContent.get_text()
+                    project.wikis.append(wiki)
+
+
+            projects.append(project)
+
+        return projects
+
+
+
+
+