Data in more complex formats (XML & HTML)

1. Parsing XML

import xml.etree.ElementTree as ET

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

print("Children of root:")
for child in root:
    print(child.tag)                  # use the tag attribute to print each child element's tag name

title = root.find('./fm/bibl/title')  # XPath expression
title_text = ''
for p in title:                       # the title text is spread over the <p> children
    title_text += p.text
print("\nTitle:\n", title_text)       # print once, after the full title has been assembled

print("\nAuthor email addresses:")
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print(email.text)
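
The same ElementTree calls also work on an inline document, which is handy for experimenting without the file. A minimal made-up sketch:

import xml.etree.ElementTree as ET

# ET.fromstring() parses XML from a string and returns the root element directly
doc = ET.fromstring('<root><fm><bibl><title><p>Example</p></title></bibl></fm></root>')
print(doc.find('./fm/bibl/title/p').text)   # Example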

2. Extracting data (XML)

Extract the data about the article's authors from the XML and add it to a list, one entry per author.
The expected format is shown below; the fnm, snm, and email tags map directly to dictionary keys.

solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
            {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
            {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
            {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
            {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
            {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
            {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
            {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]

import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": author.find('./fnm').text,
            "snm": author.find('./snm').text,
            "email": author.find('./email').text
        }
        authors.append(data)

    return authors


root = get_root(article_file)
authors = get_authors(root)
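
As a quick sanity check (assuming the file contents match the expected solution above), the first returned entry can be compared against the first solution entry:

assert authors[0] == {'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}
print(authors[0])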

3. Handling attributes (XML)

从"insr"标签中提取属性iid的值,并将其添加到字典关键字“insr”列表中
<insr iid="I2"/>

import xml.etree.ElementTree as ET

article_file = "exampleResearchArticle.xml"


def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


root = get_root(article_file)


def get_authors(root):
    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        data = {
            "fnm": author.find('./fnm').text,
            "snm": author.find('./snm').text,
            "email": author.find('./email').text,
            "insr": []
        }

        # an author can be linked to several institutions, so collect
        # the iid attribute of every <insr> child into the list
        for i in author.findall('./insr'):
            data['insr'].append(i.attrib['iid'])

        authors.append(data)

    return authors
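
A quick look at the result (assuming, as in the snippet above, that at least one author carries <insr iid="I2"/>):

authors = get_authors(root)
for a in authors:
    print(a['email'], a['insr'])   # each entry now also carries a list of iid values such as 'I2'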

4. Extracting entities (HTML)

The downloaded page contains <select> drop-downs; options() collects the value attribute of every <option> element found under a given element id.

from bs4 import BeautifulSoup


def options(soup, id):
    option_values = []
    carrier_list = soup.find(id=id)
    for option in carrier_list.find_all('option'):
        option_values.append(option['value'])
    return option_values


def print_list(label, codes):
    print("\n%s:" % label)
    for c in codes:
        print(c)


def main():
    soup = BeautifulSoup(open('virgin_and_logan_airport.html'), 'lxml')
    codes = options(soup, 'CarrierList')
    print_list('Carriers', codes)

    codes = options(soup, 'AirportList')
    print_list('Airports', codes)


if __name__ == '__main__':
    main()
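
To see what options() returns without the downloaded page, here is a minimal sketch that reuses the function above on an inline HTML snippet (the snippet itself is made up for illustration):

from bs4 import BeautifulSoup

sample = ('<select id="CarrierList">'
          '<option value="All">All U.S. Carriers</option>'
          '<option value="VX">Virgin America</option>'
          '</select>')
soup = BeautifulSoup(sample, 'lxml')
print(options(soup, 'CarrierList'))   # ['All', 'VX']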

5. Using BeautifulSoup (HTML)

Use BeautifulSoup to process the HTML, extract the hidden form field values for "__EVENTVALIDATION" and "__VIEWSTATE", and set the corresponding values in the data dictionary.

from bs4 import BeautifulSoup
import requests

html_page = "page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, 'lxml')
        # the hidden form fields are identified by their element ids
        ev = soup.find(id='__EVENTVALIDATION')
        data['eventvalidation'] = ev['value']

        vs = soup.find(id='__VIEWSTATE')
        data['viewstate'] = vs['value']

    return data

def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                      data={'AirportList': "BOS",
                            'CarrierList': "VX",
                            'Submit': 'Submit',
                            "__EVENTTARGET": "",
                            "__EVENTARGUMENT": "",
                            "__EVENTVALIDATION": eventvalidation,
                            "__VIEWSTATE": viewstate})

    return r.text
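
Putting the two pieces together (a sketch, assuming page_source.html has been saved locally; the live site may additionally require the session cookies set during a prior GET, which is what the next section handles):

data = extract_data(html_page)
response = make_request(data)
print(response[:200])   # first 200 characters of the returned page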

6. Scraping solution

A requests.Session is used so that the cookies set by the initial GET are sent back with the POST; the hidden __VIEWSTATE and __EVENTVALIDATION fields are read from the GET response and echoed back in the form data.

from bs4 import BeautifulSoup
import requests

s = requests.Session()

r = s.get('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2')
soup = BeautifulSoup(r.text, 'lxml')

viewstate_element = soup.find(id='__VIEWSTATE')
viewstate = viewstate_element['value']

eventvalidation_element = soup.find(id='__EVENTVALIDATION')
eventvalidation = eventvalidation_element['value']

r = s.post('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2',
           data={'AirportList': "BOS",
                 'CarrierList': "VX",
                 'Submit': 'Submit',
                 "__EVENTTARGET": "",
                 "__EVENTARGUMENT": "",
                 "__EVENTVALIDATION": eventvalidation,
                 "__VIEWSTATE": viewstate})

with open('virgin_and_logan_airport.html', 'w') as f:
    f.write(r.text)

7. Problem set

7.1 Carrier list (HTML)

Get a list of all the airline carriers. Strip out combination entries such as "All U.S. Carriers" from the returned data; the final result should be a list of carrier codes.

from bs4 import BeautifulSoup

html_page = "options.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        carrier_list = soup.find(id="CarrierList")
        carriers = carrier_list.find_all('option')

        for carrier in carriers:
            # carrier codes are two characters; longer values are
            # combination entries and are skipped
            if len(carrier['value']) == 2:
                data.append(carrier['value'])

    return data


def make_request(data):
    # assumes s is a requests.Session created as in section 6, and that data
    # also carries the hidden __VIEWSTATEGENERATOR field scraped from the page
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    viewstategenerator = data["viewstategenerator"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data=(("__EVENTTARGET", ""),
                     ("__EVENTARGUMENT", ""),
                     ("__VIEWSTATE", viewstate),
                     ("__VIEWSTATEGENERATOR", viewstategenerator),
                     ("__EVENTVALIDATION", eventvalidation),
                     ("CarrierList", carrier),
                     ("AirportList", airport),
                     ("Submit", "Submit")))

    return r.text
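
A quick usage sketch for the extraction step (assuming options.html is the saved form page):

carriers = extract_carriers(html_page)
print(carriers)   # a list of two-letter carrier codes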

7.2 Airport list (HTML)

Return a list of airport codes, removing any combination entries such as "All".

from bs4 import BeautifulSoup

html_page = "options.html"


def extract_airports(page):
    data = []
    with open(page, "r") as html:
        soup = BeautifulSoup(html, "lxml")
        airport_list = soup.find(id='AirportList')
        airports = airport_list.find_all('option')

        for airport in airports:
            # airport codes are all upper case, so any value containing
            # "All" is a combination entry and is skipped
            if 'All' not in airport['value']:
                data.append(airport['value'])

    return data
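
And the same sketch for the airports:

airports = extract_airports(html_page)
print(airports)   # a list of airport codes such as 'BOS'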

7.3 Processing all the data (HTML)

Let's assume that you combined the code from the previous two exercises with the code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with the flight info has the class "dataTDRight". Your task is to
use process_file() to extract the flight data from that table as a list of
dictionaries, each dictionary containing the relevant data from the file and
table row. This is an example of the data structure you should return:

data = [{"courier": "FL",
"airport": "ATL",
"year": 2012,
"month": 12,
"flights": {"domestic": 100,
"international": 100}
},
{"courier": "..."}
]
Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"


def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files


def process_file(f):
    """
    Extract the flight data from the file given as the argument, returning
    a list of dictionaries of the form shown in the problem statement above.

    Note: a new dictionary is created for each table row. If a single info
    dictionary defined outside the loop were reused, every element of the
    returned list would be a reference to that same dictionary.
    """
    data = []
    with open("{}/{}".format(datadir, f), "r") as html:
        # e.g. "FL-ATL.html" -> courier "FL", airport "ATL"
        courier, airport = f[:6].split("-")

        soup = BeautifulSoup(html, 'lxml')
        table_data = soup.find('table', {'class': 'dataTDRight'})
        for tr in table_data.find_all('tr'):
            td = tr.find_all('td')

            # skip rows without enough data cells (if any), the header row,
            # and the yearly TOTAL rows
            if len(td) < 4:
                continue
            if td[1].string == 'Month' or td[1].string == 'TOTAL':
                continue

            info = {"courier": courier,
                    "airport": airport,
                    "year": int(td[0].string),
                    "month": int(td[1].string),
                    "flights": {"domestic": int(td[2].string.replace(',', '')),
                                "international": int(td[3].string.replace(',', ''))}}
            data.append(info)

    return data
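
To run this over every downloaded file (a sketch; process_all() simply lists the directory, so it assumes data/ contains only the downloaded HTML files):

if __name__ == "__main__":
    all_data = []
    for f in process_all(datadir):
        all_data.extend(process_file(f))
    print(len(all_data), "rows extracted")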

7.4 Splitting concatenated XML documents

The patent data file is not valid XML: it contains several root elements and several XML declarations, because it is in fact a number of concatenated XML documents. The task is to split the file into separate documents, each of which can then be processed as a valid XML document.

import xml.etree.ElementTree as ET
PATENTS = 'patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    Each patent declaration starts with an XML declaration line ('<?xml'), the
    same line that made the combined file invalid XML.

    The new files are saved as "{}-{}".format(filename, n), with n counting
    from 0.
    """
    with open(filename) as infile:
        # assumes the very first line of the file is an XML declaration
        n = -1
        outfile = None
        for line in infile:
            if line.startswith('<?xml'):
                # a new document begins: close the previous output file
                # (if any) and open the next numbered one
                if outfile is not None:
                    outfile.close()
                n += 1
                outfile = open('{}-{}'.format(filename, n), 'w')

            outfile.write(line)

        if outfile is not None:
            outfile.close()

def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print("You have not split the file {} at the correct boundary!".format(fname))
            f.close()
        except IOError:
            print("Could not find file {}. Check if the filename is correct!".format(fname))


test()
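
Once split, each piece should parse cleanly with the get_root() helper defined above; a quick sketch:

for n in range(4):
    root = get_root("{}-{}".format(PATENTS, n))
    print(root.tag)   # the root tag of each recovered document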