codescrape/services/googlecode.py

113 lines
5.3 KiB
Python
Raw Permalink Normal View History

2015-09-09 02:56:08 +00:00
from Service import Service
from Project import REPO_TYPES, Project
from Release import Release
from Issue import IssueComment, Issue
from Wiki import Wiki
from bs4 import BeautifulSoup
class googlecode(Service):
    """Scraper that collects the public data of one Google Code project.

    ``getProject`` downloads the project's pages below ``DOMAIN`` through
    ``self.curl_get`` and copies the repository location, the downloads,
    the issues (with their comments) and the wiki pages onto a ``Project``.
    """

    DOMAIN = "https://code.google.com"

    # Text present on a list page (downloads/issues/wiki) that has no rows.
    _NO_RESULTS_TEXT = "Your search did not generate any results."

    # Since I want to stay on Google's good side, this method parses a
    # single project only. You will need to provide your own project list
    # to roll through. Such a list exists (although incomplete):
    # http://flossdata.syr.edu/data/gc/2012/2012-Nov/gcProjectInfo2012-Nov.txt.bz2
    def getProject(self, projectName):
        """Scrape one project and return it as a populated ``Project``.

        Args:
            projectName: the project's identifier as it appears in its
                URL, i.e. ``<DOMAIN>/p/<projectName>/``.

        Returns:
            A ``Project`` with ``repoType``, ``repoURL``, ``releases``,
            ``issues`` and ``wikis`` filled in.

        No missing-element handling is done: if a page lacks an element
        this scraper looks up, the resulting exception propagates.
        """
        project = Project()
        projectURL = self.DOMAIN + "/p/" + projectName + "/"

        self._scrapeRepository(project, projectName, projectURL)
        project.releases = self._scrapeReleases(projectName, projectURL)
        project.issues = self._scrapeIssues(projectURL)
        project.wikis = self._scrapeWikis(projectURL)
        return project

    def _fetchSoup(self, url):
        """Download ``url`` with ``curl_get`` and parse it with html.parser."""
        return BeautifulSoup(self.curl_get(url).getvalue(), "html.parser")

    def _resultRows(self, listSoup):
        """Return the data rows of a list page's results table, or ``[]``."""
        if self._NO_RESULTS_TEXT in listSoup.get_text():
            return []
        # The first <tr> is skipped (treated as the header row).
        return listSoup.find("table", "results").find_all("tr")[1:]

    @staticmethod
    def _flatten(text):
        """Return ``text`` with every newline removed."""
        return text.replace("\n", "")

    def _scrapeRepository(self, project, projectName, projectURL):
        """Set ``project.repoType`` / ``project.repoURL`` from the Source page."""
        projectpageSoup = self._fetchSoup(projectURL)
        sourceURL = projectpageSoup.find(name="a", string="Source").get("href")
        # NOTE(review): sourceURL is the raw href of the "Source" link; if
        # that href is already an absolute path, the "/p/" prefix here would
        # be duplicated -- confirm against a real project page.
        sourceSoupText = self._fetchSoup(self.DOMAIN + "/p/" + sourceURL).get_text()

        # The checkout instructions on the page reveal the VCS in use;
        # anything that is neither git nor svn is taken to be Mercurial.
        if "git clone" in sourceSoupText:
            project.repoType = REPO_TYPES.git
            project.repoURL = "https://code.google.com/p/" + projectName + "/"
        elif "svn co" in sourceSoupText:
            project.repoType = REPO_TYPES.SVN
            project.repoURL = "http://" + projectName + ".googlecode.com/svn/"
        else:
            project.repoType = REPO_TYPES.hg
            project.repoURL = "https://code.google.com/p/" + projectName + "/"

    def _scrapeReleases(self, projectName, projectURL):
        """Return one ``Release`` per row of the project's downloads list."""
        releases = []
        downloadsSoup = self._fetchSoup(projectURL + "downloads/list")
        for downloadRow in self._resultRows(downloadsSoup):
            # The link text in the second cell is the file name.
            fileName = self._flatten(downloadRow.find_all("td")[1].a.text).strip(" ")
            release = Release()
            release.fileURL = "https://" + projectName + ".googlecode.com/files/" + fileName
            release.fileName = fileName
            releases.append(release)
        return releases

    def _scrapeIssues(self, projectURL):
        """Return one ``Issue`` (with comments) per row of the issue list."""
        issues = []
        issuesSoup = self._fetchSoup(projectURL + "issues/list")
        for issueRow in self._resultRows(issuesSoup):
            cols = issueRow.find_all("td")
            issueId = self._flatten(cols[1].text).strip()
            issueStatus = self._flatten(cols[3].text).strip(" ")
            # The same cell feeds both title and summary; only the summary
            # has its surrounding spaces stripped.
            issueTitle = self._flatten(cols[8].text)

            issue = Issue()
            issue.status = issueStatus
            issue.summary = issueTitle.strip(" ")
            issue.title = issueTitle
            issue.id = issueId
            # We must go deeper (the issue's detail page) to get comments.
            issue.comments = self._scrapeIssueComments(
                projectURL + "issues/detail?id=" + issueId)
            issues.append(issue)
        return issues

    def _scrapeIssueComments(self, issueURL):
        """Return one ``IssueComment`` per ``div.vt`` on an issue detail page."""
        comments = []
        issueSoup = self._fetchSoup(issueURL)
        for comment in issueSoup.find_all("div", "vt"):
            # NOTE(review): ``.contents`` is the list of the link's child
            # nodes, not a string; kept as-is so stored data does not change.
            # Confirm whether ``.text`` was intended.
            author = comment.find(class_="author").find_all("a")[-1].contents
            date = comment.find("span", "date")["title"]
            commentText = comment.find("pre").get_text()

            issueComment = IssueComment()
            issueComment.date = date
            issueComment.author = author
            issueComment.summary = commentText
            comments.append(issueComment)
        return comments

    def _scrapeWikis(self, projectURL):
        """Return one ``Wiki`` (with page content) per row of the wiki list."""
        wikis = []
        wikiSoup = self._fetchSoup(projectURL + "w/list")
        for wikiRow in self._resultRows(wikiSoup):
            cols = wikiRow.find_all("td")
            wiki = Wiki()
            wiki.pageName = self._flatten(cols[1].text).strip(" ")
            wiki.summary = self._flatten(cols[2].text).strip(" ")
            wiki.updated = self._flatten(cols[3].text).strip(" ")

            # Fetch the page itself to capture its rendered content.
            wikiPageSoup = self._fetchSoup(projectURL + "wiki/" + wiki.pageName)
            wikiContent = wikiPageSoup.find(id="wikicontent")
            wiki.htmlContent = wikiContent.prettify()
            wiki.textContent = wikiContent.get_text()
            wikis.append(wiki)
        return wikis