import re
import urllib2

import bs4

# The story page embeds most metadata as JavaScript variable declarations
# ("var name = value;"); each regex below captures one of those values.
_STORYID_REGEX = r"var\s+storyid\s*=\s*(\d+);"
_CHAPTER_REGEX = r"var\s+chapter\s*=\s*(\d+);"
_CHAPTERS_REGEX = r"var\s+chapters\s*=\s*(\d+);"
_WORDS_REGEX = r"var\s+words\s*=\s*(\d+);"
_USERID_REGEX = r"var\s+userid\s*=\s*(\d+);"
_TITLE_REGEX = r"var\s+title\s*=\s*'(.+)';"
_TITLE_T_REGEX = r"var\s+title_t\s*=\s*'(.+)';"
_SUMMARY_REGEX = r"var\s+summary\s*=\s*'(.+)';"
_CATEGORYID_REGEX = r"var\s+categoryid\s*=\s*(\d+);"
_CAT_TITLE_REGEX = r"var\s+cat_title\s*=\s*'(.+)';"
_DATEP_REGEX = r"var\s+datep\s*=\s*'(.+)';"
_DATEU_REGEX = r"var\s+dateu\s*=\s*'(.+)';"
_AUTHOR_REGEX = r"var\s+author\s*=\s*'(.+)';"

# Useful for generating a review URL later on
_STORYTEXTID_REGEX = r"var\s+review_url\s*=\s*'.*storytextid=(\d+)';"

# Used to parse the attributes which aren't directly contained in the
# JavaScript and hence need to be parsed manually
_NON_JAVASCRIPT_REGEX = r'Rated:(.+)'
_HTML_TAG_REGEX = r'<.*?>'

# Needed to properly decide if a token contains a genre or a character name
# while manually parsing data that isn't directly contained in the JavaScript
_GENRES = [
    'General', 'Romance', 'Humor', 'Drama', 'Poetry', 'Adventure', 'Mystery',
    'Horror', 'Parody', 'Angst', 'Supernatural', 'Suspense', 'Sci-Fi',
    'Fantasy', 'Spiritual', 'Tragedy', 'Western', 'Crime', 'Family', 'Hurt',
    'Comfort', 'Friendship'
]

_CHAPTER_URL_TEMPLATE = 'http://www.fanfiction.net/s/%d/%d'

# Tokens starting with one of these prefixes mark the end of the optional
# genre/characters part of the manually parsed metadata line.
_TOKEN_TERMINATORS = ('Reviews: ', 'Updated: ', 'Published: ')


def _parse_string(regex, source):
    """Returns first group of matched regular expression as string.

    The captured byte string is decoded as UTF-8. Raises AttributeError if
    the regular expression does not match (re.search returns None).
    """
    return re.search(regex, source).group(1).decode('utf-8')


def _parse_integer(regex, source):
    """Returns first group of matched regular expression as integer.

    Raises AttributeError if the regular expression does not match.
    """
    return int(re.search(regex, source).group(1))


def _unescape_javascript_string(string_):
    """Removes JavaScript-specific string escaping characters.

    Handles escaped single quotes, double quotes and backslashes only.
    """
    return string_.replace("\\'", "'").replace('\\"', '"').replace('\\\\', '\\')


def _is_genre_token(token):
    """Returns True if token is a genre or a '/'-separated list of genres."""
    # No entry of _GENRES contains a '/', so checking every '/'-separated
    # part covers both the single-genre and the combined-genre case.
    return all(part in _GENRES for part in token.split('/'))


class Story(object):
    """Metadata of a story, parsed from the source of its page.

    Attributes set by the constructor: id, number_chapters, number_words,
    author_id, title, summary, category_id, category, date_published,
    date_updated, author, rated, language, genre, characters, reviews and
    status. genre and characters are empty strings when not specified.
    """

    def __init__(self, url, opener=urllib2.urlopen):
        """Fetches url through opener and parses the story metadata.

        url    -- address of the story page
        opener -- callable taking a URL and returning an object with a
                  read() method; defaults to urllib2.urlopen
        """
        source = opener(url).read()

        # Easily parsable and directly contained in the JavaScript, lets hope
        # that doesn't change or it turns into something like below
        self.id = _parse_integer(_STORYID_REGEX, source)
        self.number_chapters = _parse_integer(_CHAPTERS_REGEX, source)
        self.number_words = _parse_integer(_WORDS_REGEX, source)
        self.author_id = _parse_integer(_USERID_REGEX, source)
        self.title = _unescape_javascript_string(
            _parse_string(_TITLE_T_REGEX, source))
        self.summary = _unescape_javascript_string(
            _parse_string(_SUMMARY_REGEX, source))
        self.category_id = _parse_integer(_CATEGORYID_REGEX, source)
        self.category = _unescape_javascript_string(
            _parse_string(_CAT_TITLE_REGEX, source))
        self.date_published = _parse_string(_DATEP_REGEX, source)
        self.date_updated = _parse_string(_DATEU_REGEX, source)
        self.author = _unescape_javascript_string(
            _parse_string(_AUTHOR_REGEX, source))

        # Tokens of information that aren't directly contained in the
        # JavaScript, need to manually parse and filter those.
        # NOTE(review): splitting on a bare '-' also splits hyphenated values
        # such as the 'Sci-Fi' genre listed in _GENRES -- confirm the actual
        # separator used in the page markup before tightening this.
        metadata = re.sub(_HTML_TAG_REGEX, '',
                          _parse_string(_NON_JAVASCRIPT_REGEX, source))
        tokens = [token.strip() for token in metadata.split('-')]

        # Both tokens are constant and always available
        self.rated = tokens[0]
        self.language = tokens[1]

        # After the terminator tokens the remaining tokens are uninteresting
        # and looking for either character or genre tokens is useless
        if _is_genre_token(tokens[2]):
            self.genre = tokens[2]
            # tokens[2] contained the genre, check if next token contains the
            # characters
            if tokens[3].startswith(_TOKEN_TERMINATORS):
                # No characters token
                self.characters = ''
            else:
                self.characters = tokens[3]
        elif tokens[2].startswith(_TOKEN_TERMINATORS):
            # No genre and/or character was specified
            self.genre = ''
            self.characters = ''
        else:
            # tokens[2] must contain the characters since it wasn't a genre
            # (first clause) and isn't a terminator (previous clause). The
            # genre is therefore absent; set it explicitly so the attribute
            # always exists instead of raising AttributeError on access.
            self.genre = ''
            self.characters = tokens[2]

        for token in tokens:
            if token.startswith('Reviews: '):
                # Replace comma in case the review count is greater than 9999
                self.reviews = int(token.split()[1].replace(',', ''))
                break
        else:
            # "Reviews: " wasn't found and for-loop not broken, hence no (0)
            # reviews
            self.reviews = 0

        # Status is directly contained in the tokens as a single-string
        if 'Complete' in tokens:
            self.status = 'Complete'
        else:
            # FanFiction.Net calls it "In-Progress", I'll just go with that
            self.status = 'In-Progress'

    def get_chapters(self, opener=urllib2.urlopen):
        """Yields a Chapter for every chapter number of this story.

        Chapters are fetched lazily, one request per yielded item, for the
        numbers 1 up to and including number_chapters.
        """
        for number in range(1, self.number_chapters + 1):
            url = _CHAPTER_URL_TEMPLATE % (self.id, number)
            yield Chapter(url, opener)

    # Method alias which allows the user to treat the get_chapters method like
    # a normal property if no manual opener is to be specified.
    chapters = property(get_chapters)


class Chapter(object):
    """A single chapter of a story, parsed from the source of its page.

    Attributes set by the constructor: story_id, number, story_text_id,
    title and text (the markup of the story text container).
    """

    def __init__(self, url, opener=urllib2.urlopen):
        """Fetches url through opener and parses the chapter.

        url    -- address of the chapter page
        opener -- callable taking a URL and returning an object with a
                  read() method; defaults to urllib2.urlopen
        """
        source = opener(url).read()

        self.story_id = _parse_integer(_STORYID_REGEX, source)
        self.number = _parse_integer(_CHAPTER_REGEX, source)
        self.story_text_id = _parse_integer(_STORYTEXTID_REGEX, source)

        soup = bs4.BeautifulSoup(source, 'html5lib')
        select = soup.find('select', {'name': 'chapter'})
        if select:
            # There are multiple chapters available, use chapter's title.
            # The first whitespace-delimited word of the option text is
            # dropped (presumably the chapter number -- TODO confirm).
            selected = select.find('option', selected=True)
            self.title = selected.string.split(None, 1)[1]
        else:
            # No multiple chapters, one-shot or only a single chapter released
            # until now; for the lack of a proper chapter title use the
            # story's. _parse_string already returns unicode, so it must not
            # be decoded again: that would force an implicit ASCII encode and
            # fail on titles containing non-ASCII characters.
            self.title = _unescape_javascript_string(
                _parse_string(_TITLE_T_REGEX, source))

        soup = soup.find('div', id='storytext')

        # Remove AddToAny share buttons, if the page contains any
        share_buttons = soup.find(
            'div', {'class': lambda class_: class_ and 'a2a_kit' in class_})
        if share_buttons is not None:
            share_buttons.extract()

        # Normalize HTML tag attributes
        for hr in soup('hr'):
            del hr['size']
            del hr['noshade']

        self.text = soup.decode()
Run
Reset
Share
Import
Link
Embed
Language▼
English
中文
Python Fiddle
Python Cloud IDE
Follow @python_fiddle
Browser Version Not Supported
Due to Python Fiddle's reliance on advanced JavaScript techniques, older browsers might have problems running it correctly. Please download the latest version of your favourite browser.
Chrome 10+
Firefox 4+
Safari 5+
IE 10+
Let me try anyway!
url:
Go
Python Snippet
Stackoverflow Question