A temporary version of your scraper has been saved.
Sign in to save permanently
Discard this scraper
This scraper is protected: You can only watch other people editing.
Make my own copy of this scraper
You are in read-only mode.
Sign in to edit this scraper
Copy as a guest
You are viewing an earlier version of this scraper. If you save, the newest version will be overwritten!
Someone else is currently editing this scraper.
Back to scraper overview
# Demonstration of how lxml treats byte strings versus unicode strings that
# carry an XML encoding declaration.
#
# Targets Python 2: it relies on the urllib2 module. Every print call below
# takes a single argument, so the output is the same as with the Python 2
# print statement.
#
# NOTE: keep this file pure ASCII. Python 2 rejects non-ASCII bytes in a
# source file (even inside comments) unless a coding cookie is declared.
from contextlib import closing
import urllib2

import lxml.etree

FEED_URL = 'http://wiadomosci.onet.pl/kraj/rss.xml'

# The XML declaration the feed is expected to start with. Its length
# (38 characters) is the amount sliced off the unicode text further down.
XML_DECLARATION = u'<?xml version="1.0" encoding="utf-8"?>'

request = urllib2.Request(FEED_URL)
# closing() makes sure the HTTP handle is released once the body is read.
with closing(urllib2.urlopen(request)) as handle:
    response = handle.read()

# Wrapping a value in a list makes print show its repr, which exposes the
# raw escapes instead of rendering the characters.
print([response])
# ['<?xml version="1.0" encoding="utf-8"?>\n<feed xmlns=... <title>Wiadomo\xc5\x9bci...']

uresponse = response.decode("utf8")
print([uresponse])
# [u'<?xml version="1.0" encoding="utf-8"?>\n<feed xmlns=... <title>Wiadomo\u015bci...']

# Parsing the raw bytes works: lxml reads the encoding from the declaration.
tree = lxml.etree.fromstring(response)

# Default serialisation; the non-ASCII letter is expected to come back as a
# numeric character reference.
res = lxml.etree.tostring(tree)
print([res])
# ['<feed xmlns="http://www.w3.org/2005/Atom">\n<title>Wiadomo&#347;ci...']

# Same for latin1, which has no code point for that letter either.
lres = lxml.etree.tostring(tree, encoding="latin1")
print([lres])
# ["<?xml version='1.0' encoding='latin1'?>\n<feed xmlns=...<title>Wiadomo&#347;ci..."]

# Works because the 38 character encoding declaration is sliced off.
print(lxml.etree.fromstring(uresponse[len(XML_DECLARATION):]))

# Intentionally left unguarded: this throws
# ValueError(u'Unicode strings with encoding declaration are not supported.',)
# which is the behaviour this script exists to demonstrate.
print(lxml.etree.fromstring(uresponse))
You are editing,