initial commit

2015-09-08 21:56:08 -05:00
commit 88d65df2bb
11 changed files with 443 additions and 0 deletions
--- a/Issue.py
+++ b/Issue.py
@@ -0,0 +1,36 @@
+class IssueComment(object):
+    """A single comment attached to an issue.
+
+    All fields start out as empty strings; code that builds a comment creates
+    an instance and then assigns the fields it was able to extract.
+    """
+
+    # Class-level defaults. Sharing one "" object between the four names is
+    # harmless because strings are immutable.
+    author = title = summary = date = ""
+
+    def getAuthor(self):
+        """Return the author of this comment."""
+        return self.author
+
+    def getTitle(self):
+        """Return the title of this comment."""
+        return self.title
+
+    def getSummary(self):
+        """Return the body text of this comment."""
+        return self.summary
+
+    def getDate(self):
+        """Return the date of this comment, in whatever form it was stored."""
+        return self.date
+
+
+
+class Issue(IssueComment):
+    """An issue (ticket) of a project.
+
+    Inherits author/title/summary/date from IssueComment and adds a status,
+    an id and the list of IssueComment objects posted on the issue.
+    """
+
+    status = ""
+    # Class-level fallback kept for backward compatibility only. Each instance
+    # receives its own list in __init__, so appending to one issue's comments
+    # can no longer leak into every other issue.
+    comments = []
+    id = -1
+
+    def __init__(self):
+        super(Issue, self).__init__()
+        self.comments = []
+
+    def getId(self):
+        """Return the issue id (-1 until one has been assigned)."""
+        return self.id
+
+    def getStatus(self):
+        """Return the issue status string."""
+        return self.status
+
+    def getComments(self):
+        """Return the list of comments on this issue."""
+        return self.comments
+
+    # The original accessor name was misspelled; keep it so existing callers
+    # continue to work.
+    getCommnets = getComments
--- a/Project.py
+++ b/Project.py
@@ -0,0 +1,26 @@
+from enum import enum
+
+# Repository kinds a Project can have. enum() numbers the names in the order
+# given (SVN=0, git=1, hg=2, NA=3); NA is the default before a type is detected.
+REPO_TYPES = enum("SVN", "git", "hg", "NA")
+
+class Project(object):
+    """Archived data of one hosted project.
+
+    Holds the repository location and type plus the releases (downloads),
+    issues and wiki pages that were collected for the project.
+    """
+
+    repoURL = ""
+    # Class-level fallbacks kept for backward compatibility only. Each instance
+    # receives its own lists in __init__; with shared class-level lists,
+    # appending to one project's list would modify every project.
+    releases = []
+    issues = []
+    wikis = []
+    repoType = REPO_TYPES.NA
+
+    def __init__(self):
+        self.releases = []
+        self.issues = []
+        self.wikis = []
+
+    def getRepoURL(self):
+        """Return the URL of the project's repository."""
+        return self.repoURL
+
+    def getReleases(self):
+        """Return the list of Release objects (downloads) of the project."""
+        return self.releases
+
+    def getIssues(self):
+        """Return the list of Issue objects of the project."""
+        return self.issues
+
+    def getRepoType(self):
+        """Return the repository type, one of the REPO_TYPES values."""
+        return self.repoType
+
+    def getWikis(self):
+        """Return the list of Wiki objects of the project."""
+        return self.wikis
--- a/README.txt
+++ b/README.txt
@@ -0,0 +1,52 @@
+# codescrape
+
+Version 1.0
+
+By: Nathan Adams
+
+License: MIT
+
+## Description
+
+This library is used to archive project data. After the announcement that Google Code was becoming archive-only, I wanted to create a library that lets you grab a project's source data before it is gone forever.
+
+Use cases include:
+
+Archive projects due to:
+
+- Hosting service shutting down
+- Authorities sending cease-and-desist against provider/project
+- Historical, research, or educational purposes
+
+## Usage
+
+Currently srchub and Google Code are supported. To use srchub:
+
+    from services.srchub import srchub
+	shub = srchub()
+	projects = shub.getProjects()
+	
+or, for Google Code:
+
+	from services.googlecode import googlecode
+	gcode = googlecode()
+	project = gcode.getProject("android-python27")
+	
+The Sourcehub (srchub) library pulls all public projects, since that list is easily accessed. Google Code does not have a public project list per se, and I didn't want to scrape the search results, so the Google Code library requires you to pass in the project name. If you get your hands on a list of Google Code projects, you can easily loop through them:
+
+	from services.googlecode import googlecode
+	gcode = googlecode()
+	for projectName in someProjectList:
+		project = gcode.getProject(projectName)
+		# do something with project
+		
+The project data structure is as follows:
+
+project
+
+- getRepoURL() -> Returns the URL of the repo
+- getRepoType() -> Returns the type of repo (git, hg, or SVN)
+- getReleases() -> Returns all downloads related to the project
+- getIssues() -> Returns open issues
+- getWikis() -> Returns wikis
+
--- a/Release.py
+++ b/Release.py
@@ -0,0 +1,18 @@
+class Release(object):
+    """A downloadable file (release) published by a project.
+
+    The string fields default to "" and the checksum to None; code that builds
+    a release creates an instance and assigns the fields it could extract.
+    """
+
+    # Class-level defaults; safe to share because strings and None are immutable.
+    fileName = summary = fileURL = ""
+    checksum = None
+
+    def getFileName(self):
+        """Return the name of the released file."""
+        return self.fileName
+
+    def getSummary(self):
+        """Return the description of the release."""
+        return self.summary
+
+    def getFileURL(self):
+        """Return the URL the file can be downloaded from."""
+        return self.fileURL
+
+    def getChecksum(self):
+        """Return the file checksum, or None if none was recorded."""
+        return self.checksum
--- a/Service.py
+++ b/Service.py
@@ -0,0 +1,41 @@
+import pycurl
+try:
+    from cStringIO import StringIO
+except ImportError:
+    try:
+        from StringIO import StringIO
+    except ImportError:
+        from io import StringIO
+from urllib import urlencode
+
+class Service(object):
+    """Base class for hosting-service scrapers.
+
+    Provides pycurl-based GET/POST helpers that return the response body in an
+    in-memory buffer; subclasses implement the actual scraping.
+    """
+
+    def getProjects(self):
+        """Return the projects of the service.
+
+        The base implementation does nothing and returns None; subclasses that
+        can enumerate projects override it.
+        """
+        pass
+
+    def _request(self, url, header, postdata=None):
+        """Perform one HTTP request and return the buffer holding the body.
+
+        url      -- address to request
+        header   -- list of raw header lines, or None for no extra headers
+        postdata -- url-encoded body; when not None the request is a POST
+        """
+        body = StringIO()
+        cobj = pycurl.Curl()
+        try:
+            cobj.setopt(pycurl.URL, url)
+            # NOTE(review): certificate and host-name verification are switched
+            # off, so responses can be spoofed by a man-in-the-middle. Kept as
+            # in the original; enable verification if the target hosts allow it.
+            cobj.setopt(pycurl.SSL_VERIFYPEER, 0)
+            cobj.setopt(pycurl.SSL_VERIFYHOST, 0)
+            if postdata is not None:
+                cobj.setopt(pycurl.POST, 1)
+                cobj.setopt(pycurl.POSTFIELDS, postdata)
+            cobj.setopt(pycurl.WRITEDATA, body)
+            cobj.setopt(pycurl.HTTPHEADER, [] if header is None else header)
+            cobj.perform()
+        finally:
+            # Release the curl handle even when the transfer raises.
+            cobj.close()
+        return body
+
+    def curl_post(self, url, postvals, header=None):
+        """POST the url-encoded postvals to url and return the response buffer.
+
+        header is an optional list of raw header lines. The default is None
+        (meaning no extra headers) rather than a shared mutable list.
+        Read the body with .getvalue() on the returned buffer.
+        """
+        return self._request(url, header, urlencode(postvals))
+
+    def curl_get(self, url, header=None):
+        """GET url and return the response buffer.
+
+        header is an optional list of raw header lines. The default is None
+        (meaning no extra headers) rather than a shared mutable list.
+        Read the body with .getvalue() on the returned buffer.
+        """
+        return self._request(url, header)
--- a/Wiki.py
+++ b/Wiki.py
@@ -0,0 +1,22 @@
+class Wiki(object):
+    """One wiki/documentation page of a project.
+
+    All fields default to empty strings; code that builds a page creates an
+    instance and assigns the fields it could extract.
+    """
+
+    pageName = ""
+    htmlContent = ""
+    textContent = ""
+    summary = ""
+    updated = ""
+
+    def getPageName(self):
+        """Return the name of the wiki page."""
+        return self.pageName
+
+    def getHTMLContent(self):
+        """Return the page content as HTML markup."""
+        return self.htmlContent
+
+    # The original accessor name was misspelled; keep it so existing callers
+    # continue to work.
+    getHTMLCotnent = getHTMLContent
+
+    def getTextContent(self):
+        """Return the page content as plain text."""
+        return self.textContent
+
+    def getSummary(self):
+        """Return the short summary of the page."""
+        return self.summary
+
+    def getUpdated(self):
+        """Return the last-updated value of the page, as it was stored."""
+        return self.updated
--- a/enum.py
+++ b/enum.py
@@ -0,0 +1,7 @@
+# Pythonic way to do enums:
+# http://stackoverflow.com/a/1695250/195722
+def enum(*sequential, **named):
+    """Build and return a simple enumeration class.
+
+    Positional names are numbered 0, 1, 2, ... in the order given; keyword
+    arguments supply explicit name=value members. The returned class exposes
+    every member as a class attribute, plus a `val` dict that maps each value
+    back to its name.
+    """
+    enums = dict(zip(sequential, range(len(sequential))), **named)
+    # items() exists on both Python 2 and Python 3; iteritems() is Python 2 only.
+    reverse = dict((value, key) for key, value in enums.items())
+    enums['val'] = reverse
+    return type('Enum', (), enums)
--- a/main.py
+++ b/main.py
@@ -0,0 +1,8 @@
+from services.srchub import srchub
+from services.googlecode import googlecode
+
+# Example 1 (disabled): fetch every project that srchub lists.
+#shub = srchub()
+
+#projects = shub.getProjects()
+# Example 2: fetch a single Google Code project by name. This runs at import
+# time and performs network requests.
+gcode = googlecode()
+project = gcode.getProject("android-python27")
--- a/services/init.py
+++ b/services/init.py
@@ -0,0 +1 @@
+# NOTE(review): presumably this file is the package initializer for `services`
+# (main.py imports services.srchub and services.googlecode) - confirm the file name.
+__author__ = 'nathan'
--- a/services/googlecode.py
+++ b/services/googlecode.py
@@ -0,0 +1,113 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+
+class googlecode(Service):
+    """Scraper that collects the data of a single Google Code project.
+
+    getProject() is the entry point; the private helpers each scrape one
+    section (repository, downloads, issues, wiki) of the project's pages.
+    """
+
+    DOMAIN = "https://code.google.com"
+
+    # Text that list pages contain when they have no rows to show.
+    NO_RESULTS = "Your search did not generate any results."
+
+    # Since I want to stay on Google's good side
+    # I'm going to write this method to parse a single project
+    # You will need to provide your own project list to roll through
+    # Such a list exists (although incomplete)
+    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
+    def getProject(self, projectName):
+        """Scrape the project called projectName and return it as a Project.
+
+        The returned Project has repoType, repoURL, releases, issues and wikis
+        filled in. Several HTTP requests are made: one per list page plus one
+        per issue and one per wiki page.
+
+        Raises AttributeError if an expected element (for example the
+        "Source" link or a results table) is missing from a fetched page.
+        """
+        project = Project()
+        projectURL = self.DOMAIN + "/p/" + projectName + "/"
+
+        project.repoType, project.repoURL = self._getRepoInfo(projectName, projectURL)
+        project.releases = self._getReleases(projectName, projectURL)
+        project.issues = self._getIssues(projectURL)
+        project.wikis = self._getWikis(projectURL)
+        return project
+
+    def _getSoup(self, url):
+        """Fetch url and return its body parsed with the html.parser backend."""
+        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")
+
+    def _getRepoInfo(self, projectName, projectURL):
+        """Return (repoType, repoURL), detected from the project's Source page."""
+        projectpageSoup = self._getSoup(projectURL)
+        sourceHref = projectpageSoup.find(name="a", string="Source").get("href")
+
+        # Only a project-relative href needs the "/p/" prefix. A root-relative
+        # href already carries its full path, and prefixing it again would
+        # produce a doubled ".../p//p/..." URL.
+        if "://" in sourceHref:
+            sourceURL = sourceHref
+        elif sourceHref.startswith("/"):
+            sourceURL = self.DOMAIN + sourceHref
+        else:
+            sourceURL = self.DOMAIN + "/p/" + sourceHref
+        sourceSoupText = self._getSoup(sourceURL).get_text()
+
+        # The checkout instructions on the page reveal the repository type;
+        # anything that is neither git nor svn is treated as Mercurial.
+        if "git clone" in sourceSoupText:
+            return REPO_TYPES.git, "https://code.google.com/p/" + projectName + "/"
+        if "svn co" in sourceSoupText:
+            return REPO_TYPES.SVN, "http://" + projectName + ".googlecode.com/svn/"
+        return REPO_TYPES.hg, "https://code.google.com/p/" + projectName + "/"
+
+    def _getReleases(self, projectName, projectURL):
+        """Return a list of Release objects from the project's downloads page."""
+        releases = []
+        downloadsSoup = self._getSoup(projectURL + "downloads/list")
+        if self.NO_RESULTS in downloadsSoup.get_text():
+            return releases
+
+        downloadSection = downloadsSoup.find("table", "results")
+        # The first row is skipped, as in the original (presumably the header).
+        for downloadRow in downloadSection.find_all("tr")[1:]:
+            cols = downloadRow.find_all("td")
+            fileName = cols[1].a.text.replace("\n", "").strip(" ")
+            release = Release()
+            release.fileURL = "https://" + projectName + ".googlecode.com/files/" + fileName
+            release.fileName = fileName
+            releases.append(release)
+        return releases
+
+    def _getIssues(self, projectURL):
+        """Return a list of Issue objects (with comments) from the issue list."""
+        issues = []
+        issuesSoup = self._getSoup(projectURL + "issues/list")
+        if self.NO_RESULTS in issuesSoup.get_text():
+            return issues
+
+        issuesSection = issuesSoup.find("table", "results")
+        for issueRow in issuesSection.find_all("tr")[1:]:
+            cols = issueRow.find_all("td")
+            issue = Issue()
+            issue.id = cols[1].text.replace("\n", "").strip()
+            issue.status = cols[3].text.replace("\n", "").strip(" ")
+            # Title and summary both come from the same column; only the
+            # summary has surrounding spaces removed.
+            summaryText = cols[8].text.replace("\n", "")
+            issue.summary = summaryText.strip(" ")
+            issue.title = summaryText
+            # issue.author is left at its default: the original read column 5
+            # but had the assignment disabled.
+
+            # we must go deeper to get comments
+            issue.comments = self._getIssueComments(projectURL + "issues/detail?id=" + issue.id)
+            issues.append(issue)
+        return issues
+
+    def _getIssueComments(self, issueURL):
+        """Return a list of IssueComment objects from one issue detail page."""
+        comments = []
+        for comment in self._getSoup(issueURL).find_all("div", "vt"):
+            # NOTE(review): .contents is the list of the link's child nodes,
+            # not a string, so author is stored as a list here - confirm this
+            # is intended before switching to get_text().
+            author = (comment.find(class_="author").find_all("a")[-1]).contents
+            date = comment.find("span", "date")["title"]
+            commentText = comment.find("pre").get_text()
+            issueComment = IssueComment()
+            issueComment.date = date
+            issueComment.author = author
+            issueComment.summary = commentText
+            comments.append(issueComment)
+        return comments
+
+    def _getWikis(self, projectURL):
+        """Return a list of Wiki objects, fetching the content of every page."""
+        wikis = []
+        wikiSoup = self._getSoup(projectURL + "w/list")
+        if self.NO_RESULTS in wikiSoup.get_text():
+            return wikis
+
+        wikiSection = wikiSoup.find("table", "results")
+        for wikiRow in wikiSection.find_all("tr")[1:]:
+            cols = wikiRow.find_all("td")
+            wiki = Wiki()
+            wiki.pageName = cols[1].text.replace("\n", "").strip(" ")
+            wiki.summary = cols[2].text.replace("\n", "").strip(" ")
+            wiki.updated = cols[3].text.replace("\n", "").strip(" ")
+            wikiContent = self._getSoup(projectURL + "wiki/" + wiki.pageName).find(id="wikicontent")
+            wiki.htmlContent = wikiContent.prettify()
+            wiki.textContent = wikiContent.get_text()
+            wikis.append(wiki)
+        return wikis
--- a/services/srchub.py
+++ b/services/srchub.py
@@ -0,0 +1,119 @@
+from Service import Service
+from Project import REPO_TYPES, Project
+from Release import Release
+from Issue import IssueComment, Issue
+from Wiki import Wiki
+from bs4 import BeautifulSoup
+
+class srchub(Service):
+    """Scraper that collects the data of every project listed on srchub.
+
+    getProjects() is the entry point; the private helpers each scrape one
+    section (repository, downloads, issues, wiki) of a project's pages.
+    """
+
+    DOMAIN = "https://beta.datanethost.net"
+
+    def getProjects(self):
+        """Return a list of Project objects, one per link on the project list.
+
+        Several HTTP requests are made per project: one per list page plus one
+        per issue and one per documentation page.
+
+        Raises AttributeError if an expected element (for example the project
+        list or a "Source" link) is missing from a fetched page.
+        """
+        # Perhaps I should provide more API endpoints to make scraping easier...
+        soup = self._getSoup(self.DOMAIN + "/projects/")
+        links = soup.find("ul", "prjlistclass")
+        return [self._getProject(self.DOMAIN + link.get("href"))
+                for link in links.find_all("a")]
+
+    def _getSoup(self, url):
+        """Fetch url and return its body parsed with the html.parser backend."""
+        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")
+
+    def _getProject(self, projectURL):
+        """Scrape the project at projectURL and return it as a Project."""
+        project = Project()
+        # The name is taken as the next-to-last "/"-separated piece of the URL,
+        # i.e. the last path segment when the URL ends with a slash.
+        projectName = projectURL.split("/")[-2]
+
+        project.repoType, project.repoURL = self._getRepoInfo(projectName, projectURL)
+        project.releases = self._getReleases(projectName, projectURL)
+        project.issues = self._getIssues(projectURL)
+        project.wikis = self._getWikis(projectURL)
+        return project
+
+    def _getRepoInfo(self, projectName, projectURL):
+        """Return (repoType, repoURL), detected from the project's Source page."""
+        projectpageSoup = self._getSoup(projectURL)
+        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
+        sourceSoupText = self._getSoup(self.DOMAIN + sourceURL).get_text()
+
+        # DOMAIN already starts with "https://"; drop that scheme so the
+        # repository URLs do not come out as "git://https://..." or
+        # "https://https://...".
+        host = self.DOMAIN.split("://", 1)[-1]
+
+        # The checkout instructions on the page reveal the repository type;
+        # anything that is neither git nor svn is treated as Mercurial.
+        if "git clone" in sourceSoupText:
+            return REPO_TYPES.git, "git://" + host + "/" + projectName + ".git"
+        if "svn co" in sourceSoupText:
+            return REPO_TYPES.SVN, "https://" + host + "/svn/" + projectName + "/"
+        return REPO_TYPES.hg, "https://" + host + "/hg/" + projectName + "/"
+
+    def _getReleases(self, projectName, projectURL):
+        """Return a list of Release objects from the project's downloads page."""
+        releases = []
+        downloadsSoup = self._getSoup(projectURL + "downloads/")
+        if "No downloads were found." in downloadsSoup.get_text():
+            return releases
+
+        downloadSection = downloadsSoup.find("table", "uploads")
+        # The first row is skipped, as in the original (presumably the header).
+        for downloadRow in downloadSection.find_all("tr")[1:]:
+            fileName = downloadRow.find_all("td")[0].a.text
+            release = Release()
+            release.fileURL = self.DOMAIN + "/p/" + projectName + "/downloads/get/" + fileName
+            release.fileName = fileName
+            releases.append(release)
+        return releases
+
+    def _getIssues(self, projectURL):
+        """Return a list of Issue objects (with comments) from the issue list."""
+        issues = []
+        issuesSoup = self._getSoup(projectURL + "issues/")
+        if "No issues were found." in issuesSoup.get_text():
+            return issues
+
+        issuesSection = issuesSoup.find("table", "recent-issues")
+        for issueRow in issuesSection.find_all("tr")[1:]:
+            cols = issueRow.find_all("td")
+            issue = Issue()
+            issue.id = cols[0].text
+            issue.status = cols[2].text
+            issue.summary = cols[1].text
+            issue.title = cols[1].find("a").text
+            issue.author = cols[3].text
+            # we must go deeper to get comments
+            issue.comments = self._getIssueComments(projectURL + "issues/" + issue.id + "/")
+            issues.append(issue)
+        return issues
+
+    def _getIssueComments(self, issueURL):
+        """Return a list of IssueComment objects from one issue detail page."""
+        comments = []
+        for comment in self._getSoup(issueURL).find_all("div", "issue-comment"):
+            # The author is the text between the first "by" and the following
+            # comma. Split on the first "by" only, so a name that itself
+            # contains "by" is not cut short.
+            author = comment.find("p").get_text().split("by", 1)[1].split(",")[0]
+            date = comment.find("span").get_text()
+            commentText = comment.find("pre").get_text()
+            issueComment = IssueComment()
+            issueComment.date = date
+            issueComment.author = author
+            issueComment.summary = commentText
+            comments.append(issueComment)
+        return comments
+
+    def _getWikis(self, projectURL):
+        """Return a list of Wiki objects, fetching the content of every page."""
+        wikis = []
+        wikiSoup = self._getSoup(projectURL + "doc/")
+        if "No documentation pages were found." in wikiSoup.get_text():
+            return wikis
+
+        # The documentation list uses the same "recent-issues" table class as
+        # the issue list.
+        wikiSection = wikiSoup.find("table", "recent-issues")
+        for wikiRow in wikiSection.find_all("tr")[1:]:
+            cols = wikiRow.find_all("td")
+            wiki = Wiki()
+            wiki.pageName = cols[0].text
+            wiki.summary = cols[1].text
+            wiki.updated = cols[2].text
+            wikiContent = self._getSoup(projectURL + "page/" + wiki.pageName + "/").find(id="wiki-content")
+            wiki.htmlContent = wikiContent.prettify()
+            wiki.textContent = wikiContent.get_text()
+            wikis.append(wiki)
+        return wikis
+
+
+
+
+
+
+
+
+
+