codescrape/services/googlecode.py

from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup


class googlecode(Service):
    """Scraper that collects the public data of one Google Code project.

    Pages are downloaded through ``self.curl_get`` (provided by the
    ``Service`` base class) and parsed with BeautifulSoup's "html.parser".
    """

    DOMAIN = "https://code.google.com"

    # Message looked for on list pages (downloads, issues, wiki) to detect
    # an empty listing.
    NO_RESULTS_TEXT = "Your search did not generate any results."

    # To stay on Google's good side this class only parses a single project
    # per call; there is no crawling of the project index.
    # You will need to provide your own project list to roll through.
    # Such a list exists (although incomplete):
    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
    def getProject(self, projectName):
        """Scrape one project and return it as a populated ``Project``.

        Requests are issued in this order: project home page, source page,
        downloads list, issues list (plus one detail page per issue), wiki
        list (plus one page per wiki entry). Only the first page of each
        listing is read.

        Args:
            projectName: project identifier as used in
                ``https://code.google.com/p/<projectName>/``.

        Returns:
            A ``Project`` with ``repoType``, ``repoURL``, ``releases``,
            ``issues`` and ``wikis`` set. The three lists are empty when the
            corresponding listing has no usable rows.

        Raises:
            AttributeError: if the project page has no link whose text is
                exactly "Source" (``find`` returns ``None``).
        """
        project = Project()
        projectURL = self.DOMAIN + "/p/" + projectName + "/"

        self._detectRepo(project, projectName, projectURL)
        project.releases = self._scrapeReleases(projectName, projectURL)
        project.issues = self._scrapeIssues(projectURL)
        project.wikis = self._scrapeWikis(projectURL)
        return project

    def _fetchSoup(self, url):
        """Download ``url`` with ``curl_get`` and return the parsed soup."""
        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")

    def _cleanText(self, text):
        """Drop newlines, then trim leading/trailing spaces (spaces only)."""
        return text.replace("\n", "").strip(" ")

    def _resultRows(self, soup, minCells):
        """Return the ``td`` lists of the data rows of a "results" table.

        The first ``tr`` is treated as the header and skipped. Rows with
        fewer than ``minCells`` cells are skipped instead of raising
        IndexError later. An empty list is returned when the page shows the
        no-results message or has no ``table.results`` at all.
        """
        if self.NO_RESULTS_TEXT in soup.get_text():
            return []
        table = soup.find("table", "results")
        if table is None:
            return []
        rows = []
        for row in table.find_all("tr")[1:]:
            cells = row.find_all("td")
            if len(cells) >= minCells:
                rows.append(cells)
        return rows

    def _detectRepo(self, project, projectName, projectURL):
        """Set ``project.repoType`` / ``project.repoURL`` from the source page."""
        projectpageSoup = self._fetchSoup(projectURL)
        sourceHref = projectpageSoup.find(name="a", string="Source").get("href")

        # The href may already be a full URL or a site-absolute path
        # ("/p/<name>/source/..."); prefixing those with DOMAIN + "/p/" would
        # produce a malformed ".../p//p/..." address.
        if sourceHref.startswith(("http://", "https://")):
            sourceURL = sourceHref
        elif sourceHref.startswith("/"):
            sourceURL = self.DOMAIN + sourceHref
        else:
            sourceURL = self.DOMAIN + "/p/" + sourceHref

        sourceText = self._fetchSoup(sourceURL).get_text()

        # The checkout instructions on the source page reveal the VCS.
        if "git clone" in sourceText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            # Anything that is neither git nor svn is recorded as Mercurial.
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"

    def _scrapeReleases(self, projectName, projectURL):
        """Return a ``Release`` per row of the project's downloads list."""
        releases = []
        downloadsSoup = self._fetchSoup(projectURL + "downloads/list")
        for cols in self._resultRows(downloadsSoup, 2):
            link = cols[1].a
            if link is None:
                # No anchor in the file-name cell: nothing to record.
                continue
            fileName = self._cleanText(link.text)
            release = Release()
            release.fileURL = "https://" + projectName + ".googlecode.com/files/" + fileName
            release.fileName = fileName
            releases.append(release)
        return releases

    def _scrapeIssues(self, projectURL):
        """Return an ``Issue`` (with comments) per row of the issues list."""
        issues = []
        issuesSoup = self._fetchSoup(projectURL + "issues/list")
        for cols in self._resultRows(issuesSoup, 9):
            issueId = cols[1].text.replace("\n", "").strip()
            summaryText = cols[8].text.replace("\n", "")

            issue = Issue()
            issue.comments = []
            issue.status = self._cleanText(cols[3].text)
            issue.summary = summaryText.strip(" ")
            # NOTE(review): title comes from the same cell as summary but is
            # stored without trimming spaces; kept as-is so stored values do
            # not change.
            issue.title = summaryText
            issue.id = issueId

            # we must go deeper to get comments
            issueURL = projectURL + "issues/detail?id=" + issueId
            issue.comments.extend(self._scrapeIssueComments(issueURL))

            issues.append(issue)
        return issues

    def _scrapeIssueComments(self, issueURL):
        """Return an ``IssueComment`` per ``div.vt`` on an issue detail page."""
        comments = []
        issueSoup = self._fetchSoup(issueURL)
        for comment in issueSoup.find_all("div", "vt"):
            # The last anchor inside the "author" element is used.
            # NOTE(review): ``.contents`` is a list of child nodes, not a
            # string; kept unchanged so the stored value keeps its shape.
            author = (comment.find(class_="author").find_all("a")[-1]).contents
            date = comment.find("span", "date")["title"]
            commentText = comment.find("pre").get_text()

            issueComment = IssueComment()
            issueComment.date = date
            issueComment.author = author
            issueComment.summary = commentText
            comments.append(issueComment)
        return comments

    def _scrapeWikis(self, projectURL):
        """Return a ``Wiki`` per row of the project's wiki list."""
        wikis = []
        wikiListSoup = self._fetchSoup(projectURL + "w/list")
        for cols in self._resultRows(wikiListSoup, 4):
            wiki = Wiki()
            wiki.pageName = self._cleanText(cols[1].text)
            wiki.summary = self._cleanText(cols[2].text)
            wiki.updated = self._cleanText(cols[3].text)

            wikiPageSoup = self._fetchSoup(projectURL + "wiki/" + wiki.pageName)
            wikiContent = wikiPageSoup.find(id="wikicontent")
            if wikiContent is not None:
                wiki.htmlContent = wikiContent.prettify()
                wiki.textContent = wikiContent.get_text()
            # When the page has no "wikicontent" element the entry is still
            # recorded, with its content attributes left as Wiki() set them.
            wikis.append(wiki)
        return wikis
initial commit 2015-09-08 21:56:08 -05:00			`from Service import Service`
			`from Project import REPO_TYPES, Project`
			`from Release import Release`
			`from Issue import IssueComment, Issue`
			`from Wiki import Wiki`
			`from bs4 import BeautifulSoup`


			`class googlecode(Service):`

			`DOMAIN = "https://code.google.com"`

			`# Since I want to stay on Google's good side`
			`# I'm going to write this method to parse a single project`
			`# You will need to provide your own project list to roll through`
			`# Such a list exists (although incomplete)`
			`# http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2`
			`def getProject(self, projectName):`
			`project = Project()`
			`sourceType = None`
			`projectURL = self.DOMAIN + "/p/" + projectName + "/"`

			`projectpageHTML = self.curl_get(projectURL).getvalue()`
			`projectpageSoup = BeautifulSoup(projectpageHTML, "html.parser")`

			`sourceURL = projectpageSoup.find(name="a", string="Source").get("href")`
			`sourceSoup = BeautifulSoup(self.curl_get(self.DOMAIN + "/p/" + sourceURL).getvalue(), "html.parser")`
			`sourceSoupText = sourceSoup.get_text()`

			`# get source`
			`if "git clone" in sourceSoupText:`
			`project.repoType = REPO_TYPES.git`
			`project.repoURL = "https://code.google.com/p/" + projectName + "/"`
			`elif "svn co" in sourceSoupText:`
			`project.repoType = REPO_TYPES.SVN`
			`project.repoURL = "http://" + projectName + ".googlecode.com/svn/"`
			`else:`
			`project.repoType = REPO_TYPES.hg`
			`project.repoURL = "https://code.google.com/p/" + projectName + "/"`


			`# get downloads`
			`project.releases = []`
			`downlaodsSoup = BeautifulSoup(self.curl_get(projectURL + "downloads/list").getvalue(), "html.parser")`
			`downloadSection = downlaodsSoup.find("table", "results")`
			`if "Your search did not generate any results." not in downlaodsSoup.get_text():`
			`downloadRows = downloadSection.find_all("tr")[1:]`
			`for downloadRow in downloadRows:`
			`cols = downloadRow.find_all("td")`
			`downloadTD = cols[1]`
			`downloadURL = "https://" + projectName + ".googlecode.com/files/" + downloadTD.a.text.replace("\n", "").strip(" ")`
			`fileName = downloadTD.a.text.replace("\n", "").strip(" ")`
			`release = Release()`
			`release.fileURL = downloadURL`
			`release.fileName = fileName`
			`project.releases.append(release)`

			`# get issues`
			`project.issues = []`
			`issuesSoup = BeautifulSoup(self.curl_get(projectURL + "issues/list").getvalue(), "html.parser")`
			`if "Your search did not generate any results." not in issuesSoup.get_text():`
			`issuesSection = issuesSoup.find("table", "results")`
			`for issueRow in issuesSection.find_all("tr")[1:]:`
			`issue = Issue()`
			`cols = issueRow.find_all("td")`
			`issueId = cols[1].text.replace("\n", "").strip()`
			`issueURL = projectURL + "issues/detail?id=" + issueId`
			`issueStatus = cols[3].text.replace("\n", "").strip(" ")`
			`issueSummary = cols[8].text.replace("\n", "")`
			`issueTitle = cols[8].text.replace("\n", "")`
			`issueAuthor = cols[5].text.replace("\n", "")`

			`#issue.author = issueAuthor`
			`issue.comments = []`
			`issue.status = issueStatus.strip(" ")`
			`issue.summary = issueSummary.strip(" ")`
			`issue.title = issueTitle`
			`issue.id = issueId`

			`# we must go deeper to get comments`
			`issueComments = BeautifulSoup(self.curl_get(issueURL).getvalue(), "html.parser")`
			`for comment in issueComments.find_all("div", "vt"):`
			`#author = comment.find(class_="author").find("a").text`
			`author = (comment.find(class_="author").find_all("a")[-1]).contents`
			`date = comment.find("span", "date")["title"]`
			`commentText = comment.find("pre").get_text()`
			`issueComment = IssueComment()`
			`issueComment.date = date`
			`issueComment.author = author`
			`issueComment.summary = commentText`
			`issue.comments.append(issueComment)`

			`project.issues.append(issue)`

			`# get wiki pages`
			`project.wikis = []`
			`wikiSoup = BeautifulSoup(self.curl_get(projectURL + "w/list").getvalue(), "html.parser")`
			`if "Your search did not generate any results." not in wikiSoup.get_text():`
			`wikiSection = wikiSoup.find("table", "results")`
			`for wikiRow in wikiSection.find_all("tr")[1:]:`
			`wiki = Wiki()`
			`cols = wikiRow.find_all("td")`
			`wiki.pageName = cols[1].text.replace("\n", "").strip(" ")`
			`wiki.summary = cols[2].text.replace("\n", "").strip(" ")`
			`wiki.updated = cols[3].text.replace("\n", "").strip(" ")`
			`wikiURL = projectURL + "wiki/" + wiki.pageName`
			`wikiPageSoup = BeautifulSoup(self.curl_get(wikiURL).getvalue(), "html.parser")`
			`wikiContent = wikiPageSoup.find(id="wikicontent")`
			`wiki.htmlContent = wikiContent.prettify()`
			`wiki.textContent = wikiContent.get_text()`
			`project.wikis.append(wiki)`

			`return project`